diff -urN vm-ref/arch/i386/config.in vm/arch/i386/config.in --- vm-ref/arch/i386/config.in Tue Oct 30 04:32:40 2001 +++ vm/arch/i386/config.in Tue Oct 30 04:32:51 2001 @@ -403,6 +403,7 @@ bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE + bool ' Debug allocation faliures' CONFIG_DEBUG_GFP fi endmenu diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Tue Oct 30 04:32:40 2001 +++ vm/fs/buffer.c Tue Oct 30 04:32:51 2001 @@ -115,7 +115,7 @@ int dummy5; /* unused */ } b_un; unsigned int data[N_PARAM]; -} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}}; +} bdf_prm = {{20, 0, 0, 0, 5*HZ, 30*HZ, 40, 0, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; @@ -710,12 +710,8 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); - run_task_queue(&tq_disk); + try_to_free_pages_nozone(GFP_NOIO); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); schedule(); @@ -1057,19 +1053,17 @@ if (state < 0) return; - /* If we're getting into imbalance, start write-out */ - spin_lock(&lru_list_lock); - write_some_buffers(NODEV); + wakeup_bdflush(); /* * And if we're _really_ out of balance, wait for - * some of the dirty/locked buffers ourselves and - * start bdflush. + * some of the dirty/locked buffers ourselves. * This will throttle heavy writers. */ if (state > 0) { + spin_lock(&lru_list_lock); + write_some_buffers(NODEV); wait_for_some_buffers(NODEV); - wakeup_bdflush(); } } @@ -2376,23 +2370,27 @@ return 1; } -static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask) +static int sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 0; + int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { + tryagain = 0; continue; + } /* Second time through we start actively writing out.. 
*/ if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (!test_bit(BH_launder, &bh->b_state)) + if (!test_bit(BH_launder, &bh->b_state)) { + tryagain = 0; continue; + } wait_on_buffer(bh); tryagain = 1; continue; @@ -2479,7 +2477,7 @@ spin_unlock(&lru_list_lock); if (gfp_mask & __GFP_IO) { if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { - if (sync_page_buffers(bh, gfp_mask)) { + if (sync_page_buffers(bh)) { /* no IO or waiting next time */ gfp_mask = 0; goto cleaned_buffers_try_again; diff -urN vm-ref/fs/exec.c vm/fs/exec.c --- vm-ref/fs/exec.c Tue Oct 30 04:32:40 2001 +++ vm/fs/exec.c Tue Oct 30 04:32:51 2001 @@ -275,7 +275,6 @@ goto out; if (!pte_none(*pte)) goto out; - lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h --- vm-ref/include/linux/mm.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/mm.h Tue Oct 30 04:32:51 2001 @@ -296,6 +296,10 @@ #define PageLaunder(page) test_bit(PG_launder, &(page)->flags) #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) +#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) + extern void __set_page_dirty(struct page *); static inline void set_page_dirty(struct page * page) @@ -311,7 +315,7 @@ * parallel wait_on_page). */ #define UnlockPage(page) do { \ - clear_bit(PG_launder, &(page)->flags); \ + ClearPageLaunder(page); \ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ @@ -399,7 +403,6 @@ /* * There is only one 'core' page-freeing function. */ -extern void FASTCALL(free_lru_page(struct page *)); extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); @@ -409,6 +412,8 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +extern int start_aggressive_readahead(unsigned int); + extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); @@ -469,7 +474,20 @@ return page_count(page) - !!page->buffers == 1; } -extern int remove_exclusive_swap_page(struct page *); +/* + * Work out if there are any other processes sharing this + * swap cache page. Never mind the buffers. 
+ */ +static inline int exclusive_swap_page(struct page *page) +{ + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + return swap_count(page) == 1; /* 1: just cache */ +} extern void __free_pte(pte_t); diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Mon Oct 29 01:49:56 2001 +++ vm/include/linux/mmzone.h Tue Oct 30 04:32:51 2001 @@ -41,6 +41,7 @@ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; + int nr_active_pages, nr_inactive_pages; /* * free areas of different sizes diff -urN vm-ref/include/linux/pagemap.h vm/include/linux/pagemap.h --- vm-ref/include/linux/pagemap.h Tue Oct 30 03:37:11 2001 +++ vm/include/linux/pagemap.h Tue Oct 30 04:32:51 2001 @@ -29,7 +29,7 @@ #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(x) get_page(x) -#define page_cache_release(x) free_lru_page(x) +#define page_cache_release(x) __free_page(x) static inline struct page *page_cache_alloc(struct address_space *x) { diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/sched.h Tue Oct 30 04:32:51 2001 @@ -280,6 +280,14 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +struct zone_struct; + +struct local_pages { + struct list_head list; + unsigned int order, nr; + struct zone_struct * classzone; +}; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -318,8 +326,7 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; struct rw_sem_recursor mm_recursor; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + struct local_pages local_pages; /* task state */ struct linux_binfmt *binfmt; diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/swap.h Tue Oct 30 04:32:51 2001 @@ -112,6 +112,8 @@ /* linux/mm/vmscan.c */ extern wait_queue_head_t kswapd_wait; extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages_nozone(unsigned int)); +extern int vm_scan_ratio, vm_balance_ratio, vm_mapped_ratio; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -132,7 +134,6 @@ extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ -extern int out_of_memory(void); extern void oom_kill(void); /* linux/mm/swapfile.c */ @@ -176,34 +177,100 @@ BUG(); \ } while (0) +#define inc_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages++; \ + __classzone++; \ + } \ + nr_active_pages++; \ +} while (0) + +#define dec_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages--; \ + __classzone++; \ + } \ + nr_active_pages--; \ +} while (0) + +#define inc_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, 
* __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages++; \ + __classzone++; \ + } \ + nr_inactive_pages++; \ +} while (0) + +#define dec_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages--; \ + __classzone++; \ + } \ + nr_inactive_pages--; \ +} while (0) + #define add_page_to_active_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ + inc_nr_active_pages(page); \ } while (0) #define add_page_to_inactive_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ - SetPageInactive(page); \ + SetPageInactive(page); \ list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ + inc_nr_inactive_pages(page); \ } while (0) #define del_page_from_active_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageActive(page); \ - nr_active_pages--; \ + dec_nr_active_pages(page); \ + DEBUG_LRU_PAGE(page); \ } while (0) #define del_page_from_inactive_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageInactive(page); \ - nr_inactive_pages--; \ + dec_nr_inactive_pages(page); \ + DEBUG_LRU_PAGE(page); \ } while (0) /* diff -urN vm-ref/include/linux/sysctl.h vm/include/linux/sysctl.h --- vm-ref/include/linux/sysctl.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/sysctl.h Tue Oct 30 04:32:51 2001 @@ -134,12 +134,13 @@ VM_FREEPG=3, /* struct: Set free page thresholds */ VM_BDFLUSH=4, /* struct: Control buffer cache flushing */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ - VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ - VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_HEAP_STACK_GAP=11, /* int: page gap between heap and stack */ + VM_SCAN_RATIO=12, /* part of the inactive list to scan */ + VM_BALANCE_RATIO=13, /* balance active and inactive caches */ + VM_MAPPED_RATIO=14, /* pageout when we find too many mapped pages */ }; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Tue Oct 30 04:32:39 2001 +++ vm/kernel/fork.c Tue Oct 30 04:32:51 2001 @@ -645,7 +645,7 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); + INIT_LIST_HEAD(&p->local_pages.list); retval = -ENOMEM; /* copy all the process information */ diff -urN vm-ref/kernel/ksyms.c vm/kernel/ksyms.c --- vm-ref/kernel/ksyms.c Tue Oct 30 04:32:40 2001 +++ vm/kernel/ksyms.c Tue Oct 30 04:32:51 2001 @@ -89,6 +89,7 @@ EXPORT_SYMBOL(exit_sighand); /* internal kernel memory management */ +EXPORT_SYMBOL(start_aggressive_readahead); EXPORT_SYMBOL(_alloc_pages); EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Tue Oct 30 04:32:40 2001 +++ vm/kernel/sysctl.c Tue Oct 30 04:32:51 2001 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -259,6 +260,12 @@ }; static ctl_table vm_table[] = { + {VM_SCAN_RATIO, "vm_scan_ratio", + &vm_scan_ratio, sizeof(int), 0644, NULL, 
&proc_dointvec}, + {VM_BALANCE_RATIO, "vm_balance_ratio", + &vm_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAPPED_RATIO, "vm_mapped_ratio", + &vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Tue Oct 30 04:32:41 2001 +++ vm/mm/filemap.c Tue Oct 30 04:32:51 2001 @@ -1853,7 +1853,7 @@ * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. */ - mark_page_accessed(page); + activate_page(page); flush_page_to_ram(page); return page; @@ -2957,8 +2957,15 @@ } unlock: kunmap(page); + + /* + * Mark the page accessed if we wrote the + * beginning or we just did an lseek. + */ + if (!offset || !file->f_reada) + SetPageReferenced(page); + /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); page_cache_release(page); diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Tue Oct 30 04:32:40 2001 +++ vm/mm/memory.c Tue Oct 30 04:32:51 2001 @@ -941,7 +941,9 @@ if (TryLockPage(old_page)) break; /* Recheck swapcachedness once the page is locked */ - can_reuse = remove_exclusive_swap_page(old_page); + can_reuse = exclusive_swap_page(old_page); + if (can_reuse) + delete_from_swap_cache(old_page); UnlockPage(old_page); if (!can_reuse) break; @@ -975,7 +977,6 @@ if (PageReserved(old_page)) ++mm->rss; break_cow(vma, new_page, address, page_table); - lru_cache_add(new_page); /* Free the old page.. */ new_page = old_page; @@ -1141,8 +1142,12 @@ ret = 2; } - if (!Page_Uptodate(page)) - wait_on_page(page); + /* + * Freeze the "shared"ness of the page, ie page_count + swap_count. + * Must lock page before transferring our swap count to already + * obtained page count. + */ + lock_page(page); /* * Back out if somebody else faulted in this pte while we @@ -1150,6 +1155,7 @@ */ spin_lock(&mm->page_table_lock); if (!pte_same(*page_table, orig_pte)) { + UnlockPage(page); page_cache_release(page); spin_unlock(&mm->page_table_lock); return 1; @@ -1161,6 +1167,17 @@ swap_free(entry); + if (exclusive_swap_page(page)) { + if (write_access || vm_swap_full()) { + pte = pte_mkdirty(pte); + if (vma->vm_flags & VM_WRITE) + pte = pte_mkwrite(pte); + delete_from_swap_cache(page); + } + } + activate_page(page); + UnlockPage(page); + flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); @@ -1204,7 +1221,6 @@ mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - lru_cache_add(page); } set_pte(page_table, entry); @@ -1256,7 +1272,6 @@ return -1; copy_highpage(page, new_page); page_cache_release(new_page); - lru_cache_add(page); new_page = page; } diff -urN vm-ref/mm/mmap.c vm/mm/mmap.c --- vm-ref/mm/mmap.c Tue Oct 30 04:32:40 2001 +++ vm/mm/mmap.c Tue Oct 30 04:32:51 2001 @@ -74,6 +74,14 @@ free += nr_swap_pages; /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* * The code below doesn't account for free space in the inode * and dentry slab cache, slab cache fragmentation, inodes and * dentries which will become freeable under VM load, etc. 
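
For reference, the three tunables registered in the kernel/sysctl.c hunk above ("vm_scan_ratio", "vm_balance_ratio", "vm_mapped_ratio") are plain ints with proc_dointvec handlers and mode 0644, so they show up under /proc/sys/vm and can be read and written as decimal integers at runtime. A minimal userspace sketch follows; the file names come from the hunk, the defaults quoted in the comments are the ones set in mm/vmscan.c further down, and the standard /proc/sys mount is assumed.

/* Sketch only: dump the VM tunables added by this patch and
 * optionally set one, e.g. "vmtune vm_scan_ratio 16".
 * Assumes a kernel with this patch applied. */
#include <stdio.h>

static const char *knobs[] = {
	"vm_scan_ratio",	/* default 8 in mm/vmscan.c */
	"vm_balance_ratio",	/* default 6 */
	"vm_mapped_ratio",	/* default 5 */
};

int main(int argc, char **argv)
{
	char path[128];
	int i;

	for (i = 0; i < 3; i++) {
		FILE *f;
		int val;

		snprintf(path, sizeof(path), "/proc/sys/vm/%s", knobs[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* kernel without this patch */
		if (fscanf(f, "%d", &val) == 1)
			printf("%s = %d\n", knobs[i], val);
		fclose(f);
	}

	if (argc == 3) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/vm/%s", argv[1]);
		f = fopen(path, "w");
		if (!f || fprintf(f, "%s\n", argv[2]) < 0)
			return 1;
		fclose(f);
	}
	return 0;
}
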
diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Wed Oct 10 02:16:27 2001 +++ vm/mm/oom_kill.c Tue Oct 30 04:32:51 2001 @@ -192,67 +192,3 @@ schedule(); return; } - -static inline int node_zones_low(pg_data_t *pgdat) -{ - zone_t * zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - - if (zone->free_pages > (zone->pages_low)) - return 0; - - } - return 1; -} - -static int all_zones_low(void) -{ - pg_data_t * pgdat = pgdat_list; - - pgdat = pgdat_list; - do { - if (node_zones_low(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; -} - -/** - * out_of_memory - is the system out of memory? - * - * Returns 0 if there is still enough memory left, - * 1 when we are out of memory (otherwise). - */ -int out_of_memory(void) -{ - long cache_mem, limit; - - /* Enough free memory? Not OOM. */ - if (!all_zones_low()) - return 0; - - /* Enough swap space left? Not OOM. */ - if (nr_swap_pages > 0) - return 0; - - /* - * If the buffer and page cache (including swap cache) are over - * their (/proc tunable) minimum, we're still not OOM. We test - * this to make sure we don't return OOM when the system simply - * has a hard time with the cache. - */ - cache_mem = atomic_read(&page_cache_size); - limit = 2; - limit *= num_physpages / 100; - - if (cache_mem > limit) - return 0; - - /* Else... */ - return 1; -} diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Tue Oct 30 04:32:40 2001 +++ vm/mm/page_alloc.c Tue Oct 30 04:32:51 2001 @@ -144,12 +144,13 @@ * local since we must deal with fragmentation too and we * can't rely on the nr_local_pages information. */ - if (current->nr_local_pages && !current->allocation_order) + if ((current->local_pages.nr && !current->local_pages.order) || + !memclass(page->zone, current->local_pages.classzone)) goto back_local_freelist; - list_add(&page->list, ¤t->local_pages); + list_add(&page->list, ¤t->local_pages.list); page->index = order; - current->nr_local_pages++; + current->local_pages.nr++; } #define MARK_USED(index, order, area) \ @@ -231,35 +232,36 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) { struct page * page = NULL; - int __freed = 0; + int __freed; - if (!(gfp_mask & __GFP_WAIT)) - goto out; if (in_interrupt()) BUG(); - current->allocation_order = order; + current->local_pages.order = order; + current->local_pages.classzone = classzone; current->flags |= PF_MEMALLOC | PF_FREE_PAGES; __freed = try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - if (current->nr_local_pages) { + if (current->local_pages.nr) { struct list_head * entry, * local_pages; struct page * tmp; int nr_pages; - local_pages = ¤t->local_pages; + local_pages = ¤t->local_pages.list; if (likely(__freed)) { /* pick from the last inserted so we're lifo */ entry = local_pages->next; do { tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { + if (!memclass(tmp->zone, classzone)) + BUG(); + if (tmp->index == order) { list_del(entry); - current->nr_local_pages--; + current->local_pages.nr--; set_page_count(tmp, 1); page = tmp; @@ -285,7 +287,7 @@ } while ((entry = entry->next) != local_pages); } - nr_pages = current->nr_local_pages; + nr_pages = current->local_pages.nr; /* free in reverse order so that the global order will be lifo */ while ((entry = local_pages->prev) != local_pages) { list_del(entry); 
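
The local_pages rework in the page_alloc.c hunks above is easier to follow outside of the list_head macros. While a task allocates with PF_FREE_PAGES set, every page it frees is queued on a per-task list instead of going back to the buddy allocator; balance_classzone() then scans that list starting from the most recently freed page, keeps the first one whose order matches the failed allocation, returns the rest to the allocator in reverse order so the global free lists stay LIFO, and now BUG()s if a queued page falls outside the classzone (the memclass check moved into the free path). A toy model, in plain userspace C with all names invented for the sketch:

/* Toy model of the per-task local_pages behaviour, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct fake_page {
	int id;
	int order;			/* stored in page->index by the patch */
	struct fake_page *next;
};

static struct fake_page *local_list;	/* current->local_pages.list */

/* __free_pages_ok() with PF_FREE_PAGES set: queue at the head */
static void local_free(int id, int order)
{
	struct fake_page *p = malloc(sizeof(*p));

	p->id = id;
	p->order = order;
	p->next = local_list;
	local_list = p;
}

/* balance_classzone(): walk from the most recently freed page and
 * keep the first one of the requested order */
static struct fake_page *local_pick(int order)
{
	struct fake_page **pp, *p;

	for (pp = &local_list; (p = *pp) != NULL; pp = &p->next)
		if (p->order == order) {
			*pp = p->next;	/* list_del() + set_page_count(tmp, 1) */
			return p;
		}
	return NULL;
}

int main(void)
{
	struct fake_page *p;

	local_free(1, 0);
	local_free(2, 2);
	local_free(3, 0);		/* most recently freed order-0 page */

	p = local_pick(0);
	printf("picked page %d\n", p ? p->id : -1);	/* prints 3 */
	return 0;
}
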
@@ -294,9 +296,8 @@ if (!nr_pages--) BUG(); } - current->nr_local_pages = 0; + current->local_pages.nr = 0; } - out: *freed = __freed; return page; } @@ -354,8 +355,7 @@ /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & PF_MEMALLOC) { + if (current->flags & PF_MEMALLOC && !in_interrupt()) { zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); @@ -371,34 +371,52 @@ /* Atomic allocations - we can't balance anything */ if (!(gfp_mask & __GFP_WAIT)) - return NULL; + goto out; + rebalance: page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - if (zone_free_pages(z, order) > z->pages_min) { - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > z->pages_min) { + page = rmqueue(z, order); + if (page) + return page; + } } - } + goto rebalance; + } else { + /* + * Check that no other task is been killed meanwhile, + * in such a case we can succeed the allocation. + */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* Don't let big-order allocations loop */ - if (order > 1) - return NULL; + if (zone_free_pages(z, order) > z->pages_high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); +#ifdef CONFIG_DEBUG_GFP + show_stack(NULL); +#endif + return NULL; } /* @@ -427,15 +445,6 @@ return 0; } -void free_lru_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) { - if (PageActive(page) || PageInactive(page)) - lru_cache_del(page); - __free_pages_ok(page, 0); - } -} - void __free_pages(struct page *page, unsigned int order) { if (!PageReserved(page) && put_page_testzero(page)) @@ -521,17 +530,24 @@ { pg_data_t *pgdat = pgdat_list; unsigned int sum = 0; + zonelist_t *zonelist; + zone_t **zonep, *zone; do { - zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); - zone_t **zonep = zonelist->zones; - zone_t *zone; + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); + zonep = zonelist->zones; - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + zone = *zonep; + if (zone) { + sum += zone->nr_inactive_pages; + do { + unsigned int free = zone->free_pages - zone->pages_high; + zonep++; + zone = *zonep; + if (free <= 0) + continue; + sum += free; + } while (zone); } pgdat = pgdat->node_next; @@ -554,6 +570,62 @@ } #endif +/* + * If it returns non zero it means there's lots of ram "free" + * (note: not in cache!) so any caller will know that + * he can allocate some memory to do some more aggressive + * (possibly wasteful) readahead. The state of the memory + * should be rechecked after every few pages allocated for + * doing this aggressive readahead. + * + * The gfp_mask parameter specifies in which kind of memory + * the readahead information will be applocated to. 
+ */ +int start_aggressive_readahead(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep, *zone; + int ret = 0; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + for (zone = *zonep++; zone; zone = *zonep++) + if (zone->free_pages > zone->pages_high * 2) + ret = 1; + + pgdat = pgdat->node_next; + } while (pgdat); + + return ret; +} + +int try_to_free_pages_nozone(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep; + int ret = 0; + unsigned long pf_free_pages; + + pf_free_pages = current->flags & PF_FREE_PAGES; + current->flags &= ~PF_FREE_PAGES; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + ret |= try_to_free_pages(*zonep, gfp_mask, 0); + + pgdat = pgdat->node_next; + } while (pgdat); + + current->flags |= pf_free_pages; + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* @@ -568,28 +640,31 @@ pg_data_t *tmpdat = pgdat; printk("Free pages: %6dkB (%6dkB HighMem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - nr_free_highpages() << (PAGE_SHIFT-10)); + K(nr_free_pages()), + K(nr_free_highpages())); while (tmpdat) { zone_t *zone; for (zone = tmpdat->node_zones; zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) - printk("Zone:%s freepages:%6lukB min:%6luKB low:%6lukB " - "high:%6lukB\n", - zone->name, - K(zone->free_pages), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high)); - + printk("Zone:%s freepages:%6lukB|%lu min:%6luKB|%lu low:%6lukB|%lu high:%6lukB:%lu active:%6dkB|%d inactive:%6dkB|%d\n", + zone->name, + K(zone->free_pages), + zone->free_pages, + K(zone->pages_min), + zone->pages_min, + K(zone->pages_low), + zone->pages_low, + K(zone->pages_high), + zone->pages_high, + K(zone->nr_active_pages), + zone->nr_active_pages, + K(zone->nr_inactive_pages), + zone->nr_inactive_pages); + tmpdat = tmpdat->node_next; } - printk("Free pages: %6dkB (%6dkB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - printk("( Active: %d, inactive: %d, free: %d )\n", nr_active_pages, nr_inactive_pages, @@ -764,6 +839,7 @@ zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; if (!size) continue; diff -urN vm-ref/mm/page_io.c vm/mm/page_io.c --- vm-ref/mm/page_io.c Tue Oct 30 03:37:11 2001 +++ vm/mm/page_io.c Tue Oct 30 04:32:51 2001 @@ -41,7 +41,6 @@ kdev_t dev = 0; int block_size; struct inode *swapf = 0; - int wait = 0; if (rw == READ) { ClearPageUptodate(page); @@ -73,18 +72,6 @@ /* block_size == PAGE_SIZE/zones_used */ brw_page(rw, page, dev, zones, block_size); - - /* Note! For consistency we do all of the logic, - * decrementing the page count, and unlocking the page in the - * swap lock map - in the IO completion handler. - */ - if (!wait) - return 1; - - wait_on_page(page); - /* This shouldn't happen, but check to be sure. */ - if (page_count(page) == 0) - printk(KERN_ERR "rw_swap_page: page unused while waiting!\n"); return 1; } diff -urN vm-ref/mm/shmem.c vm/mm/shmem.c --- vm-ref/mm/shmem.c Tue Oct 30 03:37:11 2001 +++ vm/mm/shmem.c Tue Oct 30 04:32:53 2001 @@ -212,9 +212,7 @@ entry = *ptr; *ptr = (swp_entry_t){0}; freed++; - - /* vmscan will do the actual page freeing later.. 
*/ - swap_free (entry); + free_swap_and_cache(entry); } return freed; } @@ -449,6 +447,7 @@ BUG(); /* Remove it from the page cache */ + lru_cache_del(page); remove_inode_page(page); page_cache_release(page); @@ -550,7 +549,7 @@ swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Tue Oct 30 03:37:11 2001 +++ vm/mm/swap.c Tue Oct 30 04:32:51 2001 @@ -49,6 +49,8 @@ if (PageActive(page)) { del_page_from_active_list(page); add_page_to_inactive_list(page); + /* deactivate yes, but refile at the first mark_page_accessed */ + SetPageReferenced(page); } } @@ -67,7 +69,9 @@ if (PageInactive(page)) { del_page_from_inactive_list(page); add_page_to_active_list(page); - } + ClearPageReferenced(page); + } else + SetPageReferenced(page); } void activate_page(struct page * page) @@ -83,11 +87,11 @@ */ void lru_cache_add(struct page * page) { - if (!PageActive(page) && !PageInactive(page)) { - spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); - spin_unlock(&pagemap_lru_lock); - } + if (!PageLocked(page)) + BUG(); + spin_lock(&pagemap_lru_lock); + add_page_to_inactive_list(page); + spin_unlock(&pagemap_lru_lock); } /** @@ -103,9 +107,9 @@ del_page_from_active_list(page); } else if (PageInactive(page)) { del_page_from_inactive_list(page); - } else { -// printk("VM: __lru_cache_del, found unknown page ?!\n"); - } + } else + printk("VM: __lru_cache_del, found unknown page ?!\n"); + DEBUG_LRU_PAGE(page); } /** @@ -114,6 +118,8 @@ */ void lru_cache_del(struct page * page) { + if (!PageLocked(page)) + BUG(); spin_lock(&pagemap_lru_lock); __lru_cache_del(page); spin_unlock(&pagemap_lru_lock); diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c --- vm-ref/mm/swap_state.c Tue Oct 30 03:37:11 2001 +++ vm/mm/swap_state.c Tue Oct 30 04:32:51 2001 @@ -17,8 +17,17 @@ #include +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. + */ static int swap_writepage(struct page *page) { + if (exclusive_swap_page(page)) { + delete_from_swap_cache(page); + UnlockPage(page); + return 0; + } rw_swap_page(WRITE, page); return 0; } @@ -109,7 +118,8 @@ if (!PageLocked(page)) BUG(); - block_flushpage(page, 0); + if (block_flushpage(page, 0)) + lru_cache_del(page); entry.val = page->index; @@ -137,7 +147,8 @@ * - Marcelo */ if (PageSwapCache(page) && !TryLockPage(page)) { - remove_exclusive_swap_page(page); + if (exclusive_swap_page(page)) + delete_from_swap_cache(page); UnlockPage(page); } page_cache_release(page); diff -urN vm-ref/mm/swapfile.c vm/mm/swapfile.c --- vm-ref/mm/swapfile.c Tue Oct 30 03:37:11 2001 +++ vm/mm/swapfile.c Tue Oct 30 04:32:51 2001 @@ -224,50 +224,6 @@ } /* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. 
- */ -int remove_exclusive_swap_page(struct page *page) -{ - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - return 0; - if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ - return 0; - - entry.val = page->index; - p = swap_info_get(entry); - if (!p) - return 0; - - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[SWP_OFFSET(entry)] == 1) { - /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) { - __delete_from_swap_cache(page); - retval = 1; - } - spin_unlock(&pagecache_lock); - } - swap_info_put(p); - - if (retval) { - block_flushpage(page, 0); - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Tue Oct 30 03:37:11 2001 +++ vm/mm/vmscan.c Tue Oct 30 04:32:51 2001 @@ -26,12 +26,28 @@ #include /* - * The "priority" of VM scanning is how much of the queues we - * will scan in one go. A value of 6 for DEF_PRIORITY implies - * that we'll scan 1/64th of the queues ("queue_length >> 6") - * during a normal aging round. + * "vm_scan_ratio" is how much of the queues we will scan + * in one go. A value of 8 for vm_scan_ratio implies that we'll + * scan 1/8 of the inactive list during a normal aging round. + * So if 1/vm_scan_ratio of the inactive cache is unfreeable + * we'll start the background paging. */ -#define DEF_PRIORITY (6) +int vm_scan_ratio = 8; + +/* + * "vm_scan_ratio" controls when we start to swapout, the lower, + * the earlier we'll start to swapout. + */ +int vm_mapped_ratio = 5; + +/* + * "vm_balance_ratio" controls the balance between active and + * inactive cache. The bigger vm_balance_ratio is, the easier the + * active cache will grow, because we'll rotate the active list + * slowly. A value of 6 means we'll go towards a balance of + * 1/7 of the cache being inactive. + */ +int vm_balance_ratio = 6; /* * The swap-out function returns 1 if it successfully @@ -50,7 +66,7 @@ /* Don't look at this pte if it's been accessed recently. 
*/ if (ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); + activate_page(page); return 0; } @@ -91,6 +107,11 @@ UnlockPage(page); { int freeable = page_count(page) - !!page->buffers <= 2; +#if 0 + if (freeable) + /* don't waste time waiting this page */ + deactivate_page(page); +#endif page_cache_release(page); return freeable; } @@ -287,13 +308,13 @@ return count; } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) +static int FASTCALL(swap_out(zone_t * classzone)); +static int swap_out(zone_t * classzone) { int counter, nr_pages = SWAP_CLUSTER_MAX; struct mm_struct *mm; - counter = mmlist_nr; + counter = mmlist_nr << 1; do { if (unlikely(current->need_resched)) { __set_current_state(TASK_RUNNING); @@ -329,15 +350,13 @@ return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)); +static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask) { struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = nr_pages*10; + int max_mapped = nr_pages * vm_mapped_ratio; - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { struct page * page; if (unlikely(current->need_resched)) { @@ -356,18 +375,13 @@ list_del(entry); list_add(entry, &inactive_list); - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; - if (!memclass(page->zone, classzone)) continue; + max_scan--; + /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) + if (!page->buffers && page_count(page) != 1) goto page_mapped; /* @@ -460,31 +474,35 @@ } } + if (unlikely(!page->mapping)) + BUG(); + spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * This is the non-racy check for busy page. + * It is critical to check PageDirty _after_ we made sure + * the page is freeable so not in use by anybody. + * At this point we're guaranteed that page->buffers is NULL, + * nobody can refill page->buffers under us because we still + * hold the page lock. */ - if (!page->mapping || !is_page_cache_freeable(page)) { + if (unlikely(page_count(page) > 1)) { spin_unlock(&pagecache_lock); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; + page_mapped: + if (--max_mapped < 0) { + spin_unlock(&pagemap_lru_lock); - /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! - */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } + if (!swap_out(classzone)) + return nr_pages; + max_mapped = nr_pages * vm_mapped_ratio; - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. 
- */ + spin_lock(&pagemap_lru_lock); + } + continue; + + } if (PageDirty(page)) { spin_unlock(&pagecache_lock); UnlockPage(page); @@ -515,20 +533,6 @@ } spin_unlock(&pagemap_lru_lock); - if (nr_pages <= 0) - return 0; - - /* - * If swapping out isn't appropriate, and - * we still fail, try the other (usually smaller) - * caches instead. - */ - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif - return nr_pages; } @@ -539,60 +543,99 @@ * We move them the other way when we see the * reference bit on the page. */ -static void refill_inactive(int nr_pages) +static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); +static void refill_inactive(int nr_pages, zone_t * classzone) { struct list_head * entry; - spin_lock(&pagemap_lru_lock); entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { + while (nr_pages && entry != &active_list) { struct page * page; page = list_entry(entry, struct page, lru); entry = entry->prev; + + if (!memclass(page->zone, classzone)) + continue; + if (PageTestandClearReferenced(page)) { list_del(&page->lru); list_add(&page->lru, &active_list); continue; } + nr_pages--; + del_page_from_active_list(page); add_page_to_inactive_list(page); + SetPageReferenced(page); + } + if (entry != &active_list) { + list_del(&active_list); + list_add(&active_list, entry); } - spin_unlock(&pagemap_lru_lock); } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * shrink_vfs)); +static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * shrink_vfs) { - int chunk_size = nr_pages; + int max_scan; unsigned long ratio; nr_pages -= kmem_cache_reap(gfp_mask); if (nr_pages <= 0) return 0; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + spin_lock(&pagemap_lru_lock); + ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_balance_ratio) + 1); + if (ratio > nr_pages * 2) { + ratio = nr_pages * 2; + /* too much active cache so shrink the vfs as well */ + *shrink_vfs = 1; + } + refill_inactive(ratio, classzone); + + max_scan = classzone->nr_inactive_pages / vm_scan_ratio; + nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask); - return shrink_cache(nr_pages, classzone, gfp_mask, priority); + return nr_pages; } +static int check_classzone_need_balance(zone_t * classzone); + int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) { - int ret = 0; - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + for (;;) { + int tries = vm_scan_ratio << 2; + int nr_pages = SWAP_CLUSTER_MAX; - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + do { + int shrink_vfs = 0; + nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &shrink_vfs); + if (shrink_vfs || nr_pages > 0) { + shrink_dcache_memory(vm_scan_ratio, gfp_mask); + shrink_icache_memory(vm_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_scan_ratio, gfp_mask); +#endif + 
} + if (nr_pages <= 0) + return 1; - return ret; + if (!swap_out(classzone)) + return 0; + } while (--tries); + + if (likely(current->pid != 1)) + break; + if (!check_classzone_need_balance(classzone)) + break; + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); + } + + return 0; } DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); @@ -624,7 +667,7 @@ if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(HZ*5); continue; } if (check_classzone_need_balance(zone)) @@ -647,9 +690,6 @@ do need_more_balance |= kswapd_balance_pgdat(pgdat); while ((pgdat = pgdat->node_next)); - if (need_more_balance && out_of_memory()) { - oom_kill(); - } } while (need_more_balance); }
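
The inc_nr_active_pages()/dec_nr_active_pages() macros added to include/linux/swap.h do classzone accounting rather than plain per-zone accounting: a page is counted in its own zone and in every higher zone of the same node, so zone->nr_active_pages reads as "active pages usable by allocations limited to this classzone", which is what shrink_cache() and shrink_caches() consult. A standalone model of the walk (three zone names hardcoded purely for illustration):

/* Model of the classzone-wide LRU counters kept by this patch. */
#include <stdio.h>

#define NR_ZONES 3
static const char *name[NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int nr_active[NR_ZONES];		/* zone->nr_active_pages */

/* inc_nr_active_pages(page): bump the page's own zone and every
 * higher zone of the node, i.e. every classzone the page falls in */
static void inc_nr_active_pages(int page_zone)
{
	int z;

	for (z = page_zone; z < NR_ZONES; z++)
		nr_active[z]++;
}

int main(void)
{
	int z;

	inc_nr_active_pages(0);		/* a DMA page */
	inc_nr_active_pages(1);		/* a lowmem (Normal) page */
	inc_nr_active_pages(2);		/* a HighMem page */

	for (z = 0; z < NR_ZONES; z++)
		printf("%-8s classzone: %d active pages\n", name[z], nr_active[z]);
	/* DMA: 1, Normal: 2, HighMem: 3 */
	return 0;
}
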
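
The sizing of refill_inactive() in shrink_caches() is the piece that implements the vm_balance_ratio comment: each pass deactivates roughly nr_pages * nr_active / (nr_inactive * vm_balance_ratio + 1) pages, clamped to twice the reclaim goal, and only when the clamp kicks in are the dcache/icache shrunk as well. A worked example with invented numbers; refill_target() is just a stand-in name for that arithmetic, and 32 is SWAP_CLUSTER_MAX on 2.4/i386.

/* Worked example of the refill_inactive() sizing in shrink_caches(). */
#include <stdio.h>

static int vm_balance_ratio = 6;

static unsigned long refill_target(int nr_pages, unsigned long nr_active,
				   unsigned long nr_inactive, int *shrink_vfs)
{
	unsigned long ratio;

	ratio = (unsigned long) nr_pages * nr_active /
		(nr_inactive * vm_balance_ratio + 1);
	*shrink_vfs = 0;
	if (ratio > (unsigned long) nr_pages * 2) {
		ratio = nr_pages * 2;	/* clamp and shrink the vfs caches too */
		*shrink_vfs = 1;
	}
	return ratio;
}

int main(void)
{
	int vfs;

	/* roughly balanced cache: 60k active vs 20k inactive -> ~15 pages */
	printf("%lu\n", refill_target(32, 60000, 20000, &vfs));

	/* active list far too big: clamped to 64, vfs caches shrunk too */
	printf("%lu vfs=%d\n", refill_target(32, 600000, 20000, &vfs), vfs);
	return 0;
}
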
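
start_aggressive_readahead() is only a hint, and its comment asks callers to re-poll it every few pages while they work. A possible caller might look like the sketch below; do_readahead_one() is a hypothetical helper, GFP_HIGHUSER is just an example mask, and only start_aggressive_readahead() itself comes from this patch.

/* Sketch of a possible start_aggressive_readahead() user: a readahead
 * loop that keeps going only while memory stays plentiful. */
static void maybe_readahead(unsigned long first, unsigned long last)
{
	unsigned long idx;

	if (!start_aggressive_readahead(GFP_HIGHUSER))
		return;			/* memory is tight, skip the extra I/O */

	for (idx = first; idx <= last; idx++) {
		do_readahead_one(idx);	/* hypothetical: read one page ahead */

		/* recheck every few pages, as the comment in page_alloc.c asks */
		if (!(idx & 7) && !start_aggressive_readahead(GFP_HIGHUSER))
			break;
	}
}
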