diff -urN vm-ref/arch/sparc/kernel/sys_sunos.c vm/arch/sparc/kernel/sys_sunos.c
--- vm-ref/arch/sparc/kernel/sys_sunos.c	Tue Jan 22 18:52:53 2002
+++ vm/arch/sparc/kernel/sys_sunos.c	Sun Mar 10 07:30:58 2002
@@ -193,7 +193,7 @@
 	 * fool it, but this should catch most mistakes.
 	 */
 	freepages = atomic_read(&buffermem_pages) >> PAGE_SHIFT;
-	freepages += atomic_read(&page_cache_size);
+	freepages += page_cache_size;
 	freepages >>= 1;
 	freepages += nr_free_pages();
 	freepages += nr_swap_pages;
diff -urN vm-ref/arch/sparc64/kernel/sys_sunos32.c vm/arch/sparc64/kernel/sys_sunos32.c
--- vm-ref/arch/sparc64/kernel/sys_sunos32.c	Tue Jan 22 18:52:53 2002
+++ vm/arch/sparc64/kernel/sys_sunos32.c	Sun Mar 10 07:30:58 2002
@@ -157,7 +157,7 @@
 	 * fool it, but this should catch most mistakes.
 	 */
 	freepages = atomic_read(&buffermem_pages) >> PAGE_SHIFT;
-	freepages += atomic_read(&page_cache_size);
+	freepages += page_cache_size;
 	freepages >>= 1;
 	freepages += nr_free_pages();
 	freepages += nr_swap_pages;
diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c
--- vm-ref/fs/buffer.c	Sun Mar 10 07:30:37 2002
+++ vm/fs/buffer.c	Sun Mar 10 07:30:58 2002
@@ -105,27 +105,27 @@
 	struct {
 		int nfract;	/* Percentage of buffer cache dirty to 
 				   activate bdflush */
-		int dummy1;	/* old "ndirty" */
+		int ndirty;	/* Maximum number of dirty blocks to write out per
+				   wake-cycle */
 		int dummy2;	/* old "nrefill" */
 		int dummy3;	/* unused */
 		int interval;	/* jiffies delay between kupdate flushes */
 		int age_buffer;	/* Time for normal buffer to age before we flush it */
 		int nfract_sync;/* Percentage of buffer cache dirty to 
 				   activate bdflush synchronously */
-		int dummy4;	/* unused */
+		int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
 		int dummy5;	/* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}};
+} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
-int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   0, 0, 0};
-int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0};
+int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
 
 void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit(BH_Wait_IO, &bh->b_state);
-	clear_bit(BH_launder, &bh->b_state);
 	clear_bit(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(&bh->b_wait))
@@ -180,6 +180,7 @@
 	do {
 		struct buffer_head * bh = *array++;
 		bh->b_end_io = end_buffer_io_sync;
+		clear_bit(BH_Pending_IO, &bh->b_state);
 		submit_bh(WRITE, bh);
 	} while (--count);
 }
@@ -212,6 +213,7 @@
 		if (atomic_set_buffer_clean(bh)) {
 			__refile_buffer(bh);
 			get_bh(bh);
+			set_bit(BH_Pending_IO, &bh->b_state);
 			array[count++] = bh;
 			if (count < NRSYNC)
 				continue;
@@ -241,7 +243,6 @@
 	do
 		spin_lock(&lru_list_lock);
 	while (write_some_buffers(dev));
-	run_task_queue(&tq_disk);
 }
 
 /*
@@ -281,12 +282,6 @@
 	return 0;
 }
 
-static inline void wait_for_some_buffers(kdev_t dev)
-{
-	spin_lock(&lru_list_lock);
-	wait_for_buffers(dev, BUF_LOCKED, 1);
-}
-
 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
 {
 	do
@@ -737,12 +732,8 @@
 
 static void free_more_memory(void)
 {
-	zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
-	
-	balance_dirty();
 	wakeup_bdflush();
-	try_to_free_pages(zone, GFP_NOFS, 0);
-	run_task_queue(&tq_disk);
+	try_to_free_pages_nozone(GFP_NOIO);
 	current->policy |= SCHED_YIELD;
 	__set_current_state(TASK_RUNNING);
 	schedule();
@@ -1039,8 +1030,10 @@
 
 		conditional_schedule();
 		bh = get_hash_table(dev, block, size);
-		if (bh)
+		if (bh) {
+			touch_buffer(bh);
 			return bh;
+		}
 
 		if (!grow_buffers(dev, block, size))
 			free_more_memory();
@@ -1055,7 +1048,6 @@
 	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 
 	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
-	dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT;
 	tot = nr_free_buffer_pages();
 
 	dirty *= 100;
@@ -1072,6 +1064,21 @@
 	return -1;
 }
 
+/* Return 1 when dirty buffers are at or below nfract_stop_bdflush percent, else 0. */
+static int bdflush_stop(void)
+{
+	unsigned long dirty_pages, total_pages, stop_limit;
+
+	dirty_pages = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	total_pages = nr_free_buffer_pages();
+
+	/* Cross-multiply so the percentage test needs no division. */
+	stop_limit = total_pages * bdf_prm.b_un.nfract_stop_bdflush;
+	if (dirty_pages * 100 <= stop_limit)
+		return 1;
+	return 0;
+}
+
 /*
  * if a new dirty buffer is created we need to balance bdflush.
  *
@@ -1086,19 +1093,16 @@
 	if (state < 0)
 		return;
 
-	/* If we're getting into imbalance, start write-out */
-	spin_lock(&lru_list_lock);
-	write_some_buffers(NODEV);
+	wakeup_bdflush();
 
 	/*
 	 * And if we're _really_ out of balance, wait for
-	 * some of the dirty/locked buffers ourselves and
-	 * start bdflush.
+	 * some of the dirty/locked buffers ourselves.
 	 * This will throttle heavy writers.
 	 */
 	if (state > 0) {
-		wait_for_some_buffers(NODEV);
-		wakeup_bdflush();
+		spin_lock(&lru_list_lock);
+		write_some_buffers(NODEV);
 	}
 }
 
@@ -1192,7 +1196,6 @@
 	struct buffer_head * bh;
 
 	bh = getblk(dev, block, size);
-	touch_buffer(bh);
 	if (buffer_uptodate(bh))
 		return bh;
 	ll_rw_block(READ, 1, &bh);
@@ -2613,20 +2616,25 @@
 static int sync_page_buffers(struct buffer_head *head)
 {
 	struct buffer_head * bh = head;
-	int tryagain = 0;
+	int tryagain = 1;
 
 	do {
 		if (!buffer_dirty(bh) && !buffer_locked(bh))
 			continue;
 
 		/* Don't start IO first time around.. */
-		if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
+		if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
+			tryagain = 0;
 			continue;
+		}
+
+		if (unlikely(buffer_pending_IO(bh))) {
+			tryagain = 0;
+			continue;
+		}
 
 		/* Second time through we start actively writing out.. */
 		if (test_and_set_bit(BH_Lock, &bh->b_state)) {
-			if (!test_bit(BH_launder, &bh->b_state))
-				continue;
 			wait_on_buffer(bh);
 			tryagain = 1;
 			continue;
@@ -2639,7 +2647,6 @@
 
 		__mark_buffer_clean(bh);
 		get_bh(bh);
-		set_bit(BH_launder, &bh->b_state);
 		bh->b_end_io = end_buffer_io_sync;
 		submit_bh(WRITE, bh);
 		tryagain = 0;
@@ -2742,7 +2749,7 @@
 			atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
 
 	printk("Cache memory:   %6dkB\n",
-			(atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
+			(page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
 
 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
 	if (!spin_trylock(&lru_list_lock))
@@ -2965,13 +2972,18 @@
 	complete((struct completion *)startup);
 
 	for (;;) {
+		int ndirty = bdf_prm.b_un.ndirty;
+
 		CHECK_EMERGENCY_SYNC
 
-		spin_lock(&lru_list_lock);
-		if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
-			wait_for_some_buffers(NODEV);
-			interruptible_sleep_on(&bdflush_wait);
+		while (ndirty > 0) {
+			spin_lock(&lru_list_lock);
+			if (!write_some_buffers(NODEV))
+				break;
+			ndirty -= NRSYNC;
 		}
+		if (ndirty > 0 || bdflush_stop())
+			interruptible_sleep_on(&bdflush_wait);
 	}
 }
 
@@ -3000,8 +3012,6 @@
 	complete((struct completion *)startup);
 
 	for (;;) {
-		wait_for_some_buffers(NODEV);
-
 		/* update interval */
 		interval = bdf_prm.b_un.interval;
 		if (interval) {
@@ -3029,6 +3039,7 @@
 		printk(KERN_DEBUG "kupdate() activated...\n");
 #endif
 		sync_old_buffers();
+		run_task_queue(&tq_disk);
 	}
 }
 
diff -urN vm-ref/fs/proc/proc_misc.c vm/fs/proc/proc_misc.c
--- vm-ref/fs/proc/proc_misc.c	Tue Jan 22 18:55:57 2002
+++ vm/fs/proc/proc_misc.c	Sun Mar 10 07:30:58 2002
@@ -142,7 +142,7 @@
 #define B(x) ((unsigned long long)(x) << PAGE_SHIFT)
 	si_meminfo(&i);
 	si_swapinfo(&i);
-	pg_size = atomic_read(&page_cache_size) - i.bufferram ;
+	pg_size = page_cache_size - i.bufferram;
 
 	len = sprintf(page, "        total:    used:    free:  shared: buffers:  cached:\n"
 		"Mem:  %8Lu %8Lu %8Lu %8Lu %8Lu %8Lu\n"
diff -urN vm-ref/include/linux/fs.h vm/include/linux/fs.h
--- vm-ref/include/linux/fs.h	Sun Mar 10 07:30:36 2002
+++ vm/include/linux/fs.h	Sun Mar 10 07:30:58 2002
@@ -217,7 +217,7 @@
 	BH_New,		/* 1 if the buffer is new and not yet written out */
 	BH_Async,	/* 1 if the buffer is under end_buffer_io_async I/O */
 	BH_Wait_IO,	/* 1 if we should write out this buffer */
-	BH_launder,	/* 1 if we should throttle on this buffer */
+	BH_Pending_IO,	/* 1 if the buffer is locked but not in the I/O queue yet */
 	BH_JBD,		/* 1 if it has an attached journal_head */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
@@ -279,6 +279,7 @@
 #define buffer_mapped(bh)	__buffer_state(bh,Mapped)
 #define buffer_new(bh)		__buffer_state(bh,New)
 #define buffer_async(bh)	__buffer_state(bh,Async)
+#define buffer_pending_IO(bh)	__buffer_state(bh,Pending_IO)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 
@@ -1117,7 +1118,7 @@
 
 extern int fs_may_remount_ro(struct super_block *);
 
-extern int try_to_free_buffers(struct page *, unsigned int);
+extern int FASTCALL(try_to_free_buffers(struct page *, unsigned int));
 extern void refile_buffer(struct buffer_head * buf);
 extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
 extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
diff -urN vm-ref/include/linux/kernel.h vm/include/linux/kernel.h
--- vm-ref/include/linux/kernel.h	Tue Mar  5 17:38:23 2002
+++ vm/include/linux/kernel.h	Sun Mar 10 07:30:58 2002
@@ -106,6 +106,8 @@
 extern int tainted;
 extern const char *print_tainted(void);
 
+extern void show_stack(unsigned long *);
+
 #if DEBUG
 #define pr_debug(fmt,arg...) \
 	printk(KERN_DEBUG fmt,##arg)
diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h
--- vm-ref/include/linux/mm.h	Sun Mar 10 07:30:37 2002
+++ vm/include/linux/mm.h	Sun Mar 10 07:30:58 2002
@@ -167,9 +167,8 @@
 	 * we can simply calculate the virtual address. On machines with
 	 * highmem some memory is mapped into kernel virtual memory
 	 * dynamically, so we need a place to store that address.
-	 * Note that this field could be 16 bits on x86 ... ;)
 	 *
-	 * Architectures with slow multiplication can define
+	 * Architectures with slow ALU can define
 	 * WANT_PAGE_VIRTUAL in asm/page.h
 	 */
 #if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
@@ -310,9 +309,10 @@
 #define TryLockPage(page)	test_and_set_bit(PG_locked, &(page)->flags)
 #define PageChecked(page)	test_bit(PG_checked, &(page)->flags)
 #define SetPageChecked(page)	set_bit(PG_checked, &(page)->flags)
+
 #define PageLaunder(page)	test_bit(PG_launder, &(page)->flags)
 #define SetPageLaunder(page)	set_bit(PG_launder, &(page)->flags)
-#define __SetPageReserved(page)	__set_bit(PG_reserved, &(page)->flags)
+#define ClearPageLaunder(page)	clear_bit(PG_launder, &(page)->flags)
 
 /*
  * The zone field is never updated after free_area_init_core()
@@ -347,24 +347,18 @@
 	do {						\
 		(page)->virtual = (address);		\
 	} while(0)
-
-#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
-#define set_page_address(page, address)  do { } while(0)
-#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
-
-/*
- * Permanent address of a page. Obviously must never be
- * called on a highmem page.
- */
-#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL)
-
 #define page_address(page) ((page)->virtual)
 
 #else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
 
+#define set_page_address(page, address)  do { } while(0)
+#ifdef CONFIG_DISCONTIGMEM
 #define page_address(page)						\
 	__va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT)	\
 			+ page_zone(page)->zone_start_paddr)
+#else
+#define page_address(page) __va(((page) - mem_map) << PAGE_SHIFT) /* mem_map index -> byte offset -> __va() */
+#endif
 
 #endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */
 
@@ -403,6 +397,7 @@
 #endif
 
 #define SetPageReserved(page)		set_bit(PG_reserved, &(page)->flags)
+#define __SetPageReserved(page)		__set_bit(PG_reserved, &(page)->flags) /* just for boot time very-micro-optimization */
 #define ClearPageReserved(page)		clear_bit(PG_reserved, &(page)->flags)
 
 /*
@@ -462,6 +457,8 @@
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
 
+extern int start_aggressive_readahead(unsigned int);
+
 extern void show_free_areas(void);
 extern void show_free_areas_node(pg_data_t *pgdat);
 
@@ -526,8 +523,8 @@
 	return page_count(page) - !!page->buffers == 1;
 }
 
-extern int can_share_swap_page(struct page *);
-extern int remove_exclusive_swap_page(struct page *);
+extern int FASTCALL(make_exclusive_page(struct page *, int));
+extern int FASTCALL(remove_exclusive_swap_page(struct page *));
 
 extern void __free_pte(pte_t);
 
diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h
--- vm-ref/include/linux/mmzone.h	Fri Mar  1 00:09:39 2002
+++ vm/include/linux/mmzone.h	Sun Mar 10 07:30:58 2002
@@ -19,6 +19,11 @@
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
 
+#define ZONE_DMA		0
+#define ZONE_NORMAL		1
+#define ZONE_HIGHMEM		2
+#define MAX_NR_ZONES		3
+
 typedef struct free_area_struct {
 	struct list_head	free_list;
 	unsigned long		*map;
@@ -26,6 +31,10 @@
 
 struct pglist_data;
 
+typedef struct zone_watermarks_s {
+	unsigned long min, low, high;
+} zone_watermarks_t;
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -40,42 +49,32 @@
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
-	int			need_balance;
 
 	/*
-	 * free areas of different sizes
+	 * We don't know whether the memory we're about to allocate will be freeable
+	 * and/or will eventually be released, so to avoid totally wasting several
+	 * GB of RAM we must reserve some of the lower zone memory (otherwise we risk
+	 * running OOM on the lower zones even though there is plenty of freeable RAM
+	 * on the higher zones).
 	 */
-	free_area_t		free_area[MAX_ORDER];
+	zone_watermarks_t	watermarks[MAX_NR_ZONES];
 
 	/*
-	 * wait_table		-- the array holding the hash table
-	 * wait_table_size	-- the size of the hash table array
-	 * wait_table_shift	-- wait_table_size
-	 * 				== BITS_PER_LONG (1 << wait_table_bits)
-	 *
-	 * The purpose of all these is to keep track of the people
-	 * waiting for a page to become available and make them
-	 * runnable again when possible. The trouble is that this
-	 * consumes a lot of space, especially when so few things
-	 * wait on pages at a given time. So instead of using
-	 * per-page waitqueues, we use a waitqueue hash table.
-	 *
-	 * The bucket discipline is to sleep on the same queue when
-	 * colliding and wake all in that wait queue when removing.
-	 * When something wakes, it must check to be sure its page is
-	 * truly available, a la thundering herd. The cost of a
-	 * collision is great, but given the expected load of the
-	 * table, they should be so rare as to be outweighed by the
-	 * benefits from the saved space.
-	 *
-	 * __wait_on_page() and unlock_page() in mm/filemap.c, are the
-	 * primary users of these fields, and in mm/page_alloc.c
-	 * free_area_init_core() performs the initialization of them.
+	 * The below fields are protected by different locks (or by
+	 * no lock at all like need_balance), so they're longs to
+	 * provide an atomic granularity against each other on
+	 * all architectures.
 	 */
-	wait_queue_head_t	* wait_table;
-	unsigned long		wait_table_size;
-	unsigned long		wait_table_shift;
+	unsigned long		need_balance;
+	/* protected by the pagemap_lru_lock */
+	unsigned long		nr_active_pages, nr_inactive_pages;
+	/* protected by the pagecache_lock */
+	unsigned long		nr_cache_pages;
+
+	/*
+	 * free areas of different sizes
+	 */
+	free_area_t		free_area[MAX_ORDER];
 
 	/*
 	 * Discontig memory support fields.
@@ -90,13 +89,9 @@
 	 */
 	char			*name;
 	unsigned long		size;
+	unsigned long		realsize;
 } zone_t;
 
-#define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
-#define MAX_NR_ZONES		3
-
 /*
  * One allocation request operates on a zonelist. A zonelist
  * is a list of zones, the first one is the 'goal' of the
@@ -114,6 +109,32 @@
 
 #define GFP_ZONEMASK	0x0f
 
+typedef struct wait_table_s {
+	/*
+	 * The purpose of all these is to keep track of the people
+	 * waiting for a page to become available and make them
+	 * runnable again when possible. The trouble is that this
+	 * consumes a lot of space, especially when so few things
+	 * wait on pages at a given time. So instead of using
+	 * per-page waitqueues, we use a waitqueue hash table.
+	 *
+	 * The bucket discipline is to sleep on the same queue when
+	 * colliding and wake all in that wait queue when removing.
+	 * When something wakes, it must check to be sure its page is
+	 * truly available, a la thundering herd. The cost of a
+	 * collision is great, but given the expected load of the
+	 * table, they should be so rare as to be outweighed by the
+	 * benefits from the saved space.
+	 *
+	 * __wait_on_page() and unlock_page() in mm/filemap.c, are the
+	 * primary users of these fields, and in mm/page_alloc.c
+	 * free_area_init_core() performs the initialization of them.
+	 */
+	wait_queue_head_t	* head;
+	unsigned long		shift;
+	unsigned long		size;
+} wait_table_t;
+
 /*
  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
  * (mostly NUMA machines?) to denote a higher-level memory zone than the
@@ -137,14 +158,15 @@
 	unsigned long node_start_mapnr;
 	unsigned long node_size;
 	int node_id;
+	wait_table_t wait_table;
 	struct pglist_data *node_next;
 } pg_data_t;
 
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
-#define memclass(pgzone, classzone)	(((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \
-			&& ((pgzone) <= (classzone)))
+#define zone_idx(zone)			((zone) - (zone)->zone_pgdat->node_zones)
+#define memclass(pgzone, classzone)	(zone_idx(pgzone) <= zone_idx(classzone))
 
 /*
  * The following two are not meant for general usage. They are here as
diff -urN vm-ref/include/linux/pagemap.h vm/include/linux/pagemap.h
--- vm-ref/include/linux/pagemap.h	Fri Mar  1 00:09:39 2002
+++ vm/include/linux/pagemap.h	Sun Mar 10 07:31:06 2002
@@ -45,7 +45,7 @@
 #define PAGE_HASH_BITS (page_hash_bits)
 #define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS)
 
-extern atomic_t page_cache_size; /* # of pages currently in the hash table */
+extern unsigned long page_cache_size; /* # of pages currently in the hash table */
 extern struct page **page_hash_table;
 
 extern void page_cache_init(unsigned long);
@@ -96,8 +96,6 @@
 	if (PageLocked(page))
 		___wait_on_page(page);
 }
-
-extern void wake_up_page(struct page *);
 
 extern struct page * grab_cache_page (struct address_space *, unsigned long);
 extern struct page * grab_cache_page_nowait (struct address_space *, unsigned long);
diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h
--- vm-ref/include/linux/sched.h	Sun Mar 10 07:30:37 2002
+++ vm/include/linux/sched.h	Sun Mar 10 07:30:58 2002
@@ -282,6 +282,14 @@
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
+struct zone_struct;
+
+struct local_pages {
+	struct list_head list;
+	unsigned int order, nr;
+	struct zone_struct * classzone;
+};
+
 struct task_struct {
 	/*
 	 * offsets of these are hardcoded elsewhere - touch with care
@@ -327,8 +335,7 @@
 
 	struct task_struct *next_task, *prev_task;
 	struct mm_struct *active_mm;
-	struct list_head local_pages;
-	unsigned int allocation_order, nr_local_pages;
+	struct local_pages local_pages;
 
 /* task state */
 	struct linux_binfmt *binfmt;
@@ -428,7 +435,6 @@
 #define PF_DUMPCORE	0x00000200	/* dumped core */
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
-#define PF_MEMDIE	0x00001000	/* Killed for out-of-memory */
 #define PF_FREE_PAGES	0x00002000	/* per process page freeing */
 #define PF_NOIO		0x00004000	/* avoid generating further I/O */
 
diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h
--- vm-ref/include/linux/swap.h	Sun Mar 10 07:30:37 2002
+++ vm/include/linux/swap.h	Sun Mar 10 07:31:06 2002
@@ -88,7 +88,7 @@
 extern int nr_active_pages;
 extern int nr_inactive_pages;
 extern atomic_t nr_async_pages;
-extern atomic_t page_cache_size;
+extern unsigned long page_cache_size;
 extern atomic_t buffermem_pages;
 
 extern spinlock_cacheline_t pagecache_lock_cacheline;
@@ -115,6 +115,8 @@
 /* linux/mm/vmscan.c */
 extern wait_queue_head_t kswapd_wait;
 extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
+extern int FASTCALL(try_to_free_pages_nozone(unsigned int));
+extern int vm_vfs_scan_ratio, vm_cache_scan_ratio, vm_lru_balance_ratio, vm_passes, vm_gfp_debug, vm_mapped_ratio;
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *);
@@ -176,32 +178,128 @@
 		BUG();				\
 } while (0)
 
+#define inc_nr_active_pages(page)				\
+do {								\
+	pg_data_t * __pgdat;					\
+	zone_t * __classzone, * __overflow;			\
+	/* Count the page in its own zone and in every later zone of its node, then globally. */ \
+	__classzone = page_zone(page);				\
+	__pgdat = __classzone->zone_pgdat;			\
+	__overflow = __pgdat->node_zones + __pgdat->nr_zones;	\
+	/* __overflow points one past the node's last zone. */	\
+	while (__classzone < __overflow) {			\
+		__classzone->nr_active_pages++;			\
+		__classzone++;					\
+	}							\
+	nr_active_pages++;					\
+} while (0)
+
+#define dec_nr_active_pages(page)				\
+do {								\
+	pg_data_t * __pgdat;					\
+	zone_t * __classzone, * __overflow;			\
+	/* Uncount the page from its own zone and from every later zone of its node, then globally. */ \
+	__classzone = page_zone(page);				\
+	__pgdat = __classzone->zone_pgdat;			\
+	__overflow = __pgdat->node_zones + __pgdat->nr_zones;	\
+	/* __overflow points one past the node's last zone. */	\
+	while (__classzone < __overflow) {			\
+		__classzone->nr_active_pages--;			\
+		__classzone++;					\
+	}							\
+	nr_active_pages--;					\
+} while (0)
+
+#define inc_nr_inactive_pages(page)				\
+do {								\
+	pg_data_t * __pgdat;					\
+	zone_t * __classzone, * __overflow;			\
+	/* Count the page in its own zone and in every later zone of its node, then globally. */ \
+	__classzone = page_zone(page);				\
+	__pgdat = __classzone->zone_pgdat;			\
+	__overflow = __pgdat->node_zones + __pgdat->nr_zones;	\
+	/* __overflow points one past the node's last zone. */	\
+	while (__classzone < __overflow) {			\
+		__classzone->nr_inactive_pages++;		\
+		__classzone++;					\
+	}							\
+	nr_inactive_pages++;					\
+} while (0)
+
+#define dec_nr_inactive_pages(page)				\
+do {								\
+	pg_data_t * __pgdat;					\
+	zone_t * __classzone, * __overflow;			\
+	/* Uncount the page from its own zone and from every later zone of its node, then globally. */ \
+	__classzone = page_zone(page);				\
+	__pgdat = __classzone->zone_pgdat;			\
+	__overflow = __pgdat->node_zones + __pgdat->nr_zones;	\
+	/* __overflow points one past the node's last zone. */	\
+	while (__classzone < __overflow) {			\
+		__classzone->nr_inactive_pages--;		\
+		__classzone++;					\
+	}							\
+	nr_inactive_pages--;					\
+} while (0)
+
 #define add_page_to_active_list(page)		\
 do {						\
 	DEBUG_LRU_PAGE(page);			\
 	SetPageActive(page);			\
 	list_add(&(page)->lru, &active_list);	\
-	nr_active_pages++;			\
+	inc_nr_active_pages(page);		\
 } while (0)
 
 #define add_page_to_inactive_list(page)		\
 do {						\
 	DEBUG_LRU_PAGE(page);			\
 	list_add(&(page)->lru, &inactive_list);	\
-	nr_inactive_pages++;			\
+	inc_nr_inactive_pages(page);		\
 } while (0)
 
 #define del_page_from_active_list(page)		\
 do {						\
 	list_del(&(page)->lru);			\
 	ClearPageActive(page);			\
-	nr_active_pages--;			\
+	dec_nr_active_pages(page);		\
 } while (0)
 
 #define del_page_from_inactive_list(page)	\
 do {						\
 	list_del(&(page)->lru);			\
-	nr_inactive_pages--;			\
+	dec_nr_inactive_pages(page);		\
+} while (0)
+
+#define inc_nr_cache_pages(page)				\
+do {								\
+	pg_data_t * __pgdat;					\
+	zone_t * __classzone, * __overflow;			\
+	/* Count the page in its own zone and in every later zone of its node, then in page_cache_size. */ \
+	__classzone = page_zone(page);				\
+	__pgdat = __classzone->zone_pgdat;			\
+	__overflow = __pgdat->node_zones + __pgdat->nr_zones;	\
+	/* __overflow points one past the node's last zone. */	\
+	while (__classzone < __overflow) {			\
+		__classzone->nr_cache_pages++;			\
+		__classzone++;					\
+	}							\
+	page_cache_size++;					\
+} while (0)
+
+#define dec_nr_cache_pages(page)				\
+do {								\
+	pg_data_t * __pgdat;					\
+	zone_t * __classzone, * __overflow;			\
+	/* Uncount the page from its own zone and from every later zone of its node, then from page_cache_size. */ \
+	__classzone = page_zone(page);				\
+	__pgdat = __classzone->zone_pgdat;			\
+	__overflow = __pgdat->node_zones + __pgdat->nr_zones;	\
+	/* __overflow points one past the node's last zone. */	\
+	while (__classzone < __overflow) {			\
+		__classzone->nr_cache_pages--;			\
+		__classzone++;					\
+	}							\
+	page_cache_size--;					\
 } while (0)
 
 extern spinlock_t swaplock;
diff -urN vm-ref/include/linux/sysctl.h vm/include/linux/sysctl.h
--- vm-ref/include/linux/sysctl.h	Sun Mar 10 07:30:37 2002
+++ vm/include/linux/sysctl.h	Sun Mar 10 07:31:44 2002
@@ -141,9 +141,15 @@
 	VM_PGT_CACHE=9,		/* struct: Set page table cache parameters */
 	VM_PAGE_CLUSTER=10,	/* int: set number of pages to swap together */
 	VM_HEAP_STACK_GAP=11,	/* int: page gap between heap and stack */
-       VM_MIN_READAHEAD=12,    /* Min file readahead */
-       VM_MAX_READAHEAD=13,     /* Max file readahead */
-	VM_MAX_MAP_COUNT=19	/* int: Maximum number of active map areas */
+	VM_MIN_READAHEAD=12,    /* Min file readahead */
+	VM_MAX_READAHEAD=13,     /* Max file readahead */
+	VM_MAX_MAP_COUNT=14,	/* int: Maximum number of active map areas */
+	VM_VFS_SCAN_RATIO=15,	/* part of the inactive vfs lists to scan */
+	VM_LRU_BALANCE_RATIO=16,/* balance active and inactive caches */
+	VM_PASSES=17,		/* number of vm passes before failing */
+	VM_GFP_DEBUG=18,	/* debug GFP failures */
+	VM_CACHE_SCAN_RATIO=19,	/* part of the inactive cache list to scan */
+	VM_MAPPED_RATIO=20,	/* amount of unfreeable pages that triggers swapout */
 };
 
 
diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c
--- vm-ref/kernel/fork.c	Sun Mar 10 07:30:37 2002
+++ vm/kernel/fork.c	Sun Mar 10 07:30:58 2002
@@ -660,7 +660,7 @@
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
-	INIT_LIST_HEAD(&p->local_pages);
+	INIT_LIST_HEAD(&p->local_pages.list);
 
 	retval = -ENOMEM;
 	/* copy all the process information */
diff -urN vm-ref/kernel/ksyms.c vm/kernel/ksyms.c
--- vm-ref/kernel/ksyms.c	Sun Mar 10 07:30:36 2002
+++ vm/kernel/ksyms.c	Sun Mar 10 07:30:58 2002
@@ -90,6 +90,7 @@
 EXPORT_SYMBOL(exit_sighand);
 
 /* internal kernel memory management */
+EXPORT_SYMBOL(start_aggressive_readahead);
 EXPORT_SYMBOL(_alloc_pages);
 EXPORT_SYMBOL(__alloc_pages);
 EXPORT_SYMBOL(alloc_pages_node);
diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c
--- vm-ref/kernel/sysctl.c	Sun Mar 10 07:30:37 2002
+++ vm/kernel/sysctl.c	Sun Mar 10 07:31:06 2002
@@ -30,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
+#include <linux/swap.h>
 
 #include <asm/uaccess.h>
 
@@ -260,6 +261,18 @@
 };
 
 static ctl_table vm_table[] = {
+	{VM_GFP_DEBUG, "vm_gfp_debug",	/* each entry needs its own ctl_name id */
+	 &vm_gfp_debug, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_VFS_SCAN_RATIO, "vm_vfs_scan_ratio", 
+	 &vm_vfs_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_CACHE_SCAN_RATIO, "vm_cache_scan_ratio", 
+	 &vm_cache_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_MAPPED_RATIO, "vm_mapped_ratio", 
+	 &vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_LRU_BALANCE_RATIO, "vm_lru_balance_ratio", 
+	 &vm_lru_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec},
+	{VM_PASSES, "vm_passes", 
+	 &vm_passes, sizeof(int), 0644, NULL, &proc_dointvec},
 	{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
 	 &proc_dointvec_minmax, &sysctl_intvec, NULL,
 	 &bdflush_min, &bdflush_max},
diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c
--- vm-ref/mm/filemap.c	Sun Mar 10 07:30:37 2002
+++ vm/mm/filemap.c	Sun Mar 10 07:31:06 2002
@@ -43,7 +43,7 @@
  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  */
 
-atomic_t page_cache_size = ATOMIC_INIT(0);
+unsigned long page_cache_size;
 unsigned int page_hash_bits;
 struct page **page_hash_table;
 
@@ -80,7 +80,7 @@
 		next->pprev_hash = &page->next_hash;
 	if (page->buffers)
 		PAGE_BUG(page);
-	atomic_inc(&page_cache_size);
+	inc_nr_cache_pages(page);
 }
 
 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
@@ -110,7 +110,7 @@
 		next->pprev_hash = pprev;
 	*pprev = next;
 	page->pprev_hash = NULL;
-	atomic_dec(&page_cache_size);
+	dec_nr_cache_pages(page);
 }
 
 /*
@@ -740,25 +740,14 @@
 	return 0;
 }
 
-/*
- * Knuth recommends primes in approximately golden ratio to the maximum
- * integer representable by a machine word for multiplicative hashing.
- * Chuck Lever verified the effectiveness of this technique:
- * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
- *
- * These primes are chosen to be bit-sparse, that is operations on
- * them can use shifts and additions instead of multiplications for
- * machines where multiplications are slow.
- */
-#if BITS_PER_LONG == 32
-/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define GOLDEN_RATIO_PRIME 0x9e370001UL
-#elif BITS_PER_LONG == 64
-/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
-#else
-#error Define GOLDEN_RATIO_PRIME for your wordsize.
-#endif
+static inline wait_queue_head_t * wait_table_hashfn(struct page * page, wait_table_t * wait_table)
+{
+	/* Largest power of two that divides sizeof(struct page). */
+	const unsigned long pow2 = sizeof(struct page) & ~(sizeof(struct page) - 1);
+	const unsigned long idx = ((unsigned long) page) / pow2;
+
+	return wait_table->head + ((idx + (idx >> wait_table->shift)) & (wait_table->size - 1));
+}
 
 /*
  * In order to wait for pages to become available there must be
@@ -770,35 +759,10 @@
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-static inline wait_queue_head_t *page_waitqueue(struct page *page)
+static inline wait_queue_head_t * page_waitqueue(struct page *page)
 {
-	const zone_t *zone = page_zone(page);
-	wait_queue_head_t *wait = zone->wait_table;
-	unsigned long hash = (unsigned long)page;
-
-#if BITS_PER_LONG == 64
-	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
-	unsigned long n = hash;
-	n <<= 18;
-	hash -= n;
-	n <<= 33;
-	hash -= n;
-	n <<= 3;
-	hash += n;
-	n <<= 3;
-	hash -= n;
-	n <<= 4;
-	hash += n;
-	n <<= 2;
-	hash += n;
-#else
-	/* On some cpus multiply is faster, on others gcc will do shifts */
-	hash *= GOLDEN_RATIO_PRIME;
-#endif
-
-	hash >>= zone->wait_table_shift;
-
-	return &wait[hash];
+	pg_data_t * pgdat = page_zone(page)->zone_pgdat;
+	return wait_table_hashfn(page, &pgdat->wait_table);
 }
 
 /* 
@@ -832,13 +796,13 @@
 void unlock_page(struct page *page)
 {
 	wait_queue_head_t *waitqueue = page_waitqueue(page);
-	clear_bit(PG_launder, &(page)->flags);
+	ClearPageLaunder(page);
 	smp_mb__before_clear_bit();
 	if (!test_and_clear_bit(PG_locked, &(page)->flags))
 		BUG();
 	smp_mb__after_clear_bit(); 
 	if (waitqueue_active(waitqueue))
-		wake_up_all(waitqueue);
+		wake_up(waitqueue);
 }
 
 /*
@@ -851,7 +815,7 @@
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
 
-	add_wait_queue_exclusive(waitqueue, &wait);
+	add_wait_queue(waitqueue, &wait);
 	for (;;) {
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (PageLocked(page)) {
@@ -865,12 +829,6 @@
 	remove_wait_queue(waitqueue, &wait);
 }
 
-void wake_up_page(struct page *page)
-{
-	wake_up(page_waitqueue(page));
-}
-EXPORT_SYMBOL(wake_up_page);
-
 /*
  * Get an exclusive lock on the page, optimistically
  * assuming it's not locked..
@@ -1973,7 +1931,6 @@
 	 * and possibly copy it over to another page..
 	 */
 	mark_page_accessed(page);
-	flush_page_to_ram(page);
 	return page;
 
 no_cached_page:
@@ -3095,8 +3052,15 @@
 		}
 unlock:
 		kunmap(page);
+
+		/*
+		 * Mark the page accessed if we wrote the
+		 * beginning or we just did an lseek.
+		 */
+		if (!offset || !file->f_reada)
+			SetPageReferenced(page);
+
 		/* Mark it unlocked again and drop the page.. */
-		SetPageReferenced(page);
 		UnlockPage(page);
 		page_cache_release(page);
 
diff -urN vm-ref/mm/memory.c vm/mm/memory.c
--- vm-ref/mm/memory.c	Sun Mar 10 07:30:37 2002
+++ vm/mm/memory.c	Sun Mar 10 07:30:58 2002
@@ -964,15 +964,11 @@
 	if (!VALID_PAGE(old_page))
 		goto bad_wp_page;
 
-	if (!TryLockPage(old_page)) {
-		int reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address);
-			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
-			spin_unlock(&mm->page_table_lock);
-			return 1;	/* Minor fault */
-		}
+	if (make_exclusive_page(old_page, 1)) {	
+		flush_cache_page(vma, address);
+		establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
+		spin_unlock(&mm->page_table_lock);
+		return 1;	/* Minor fault */
 	}
 
 	/*
@@ -990,6 +986,19 @@
 	 * Re-check the pte - we dropped the lock
 	 */
 	spin_lock(&mm->page_table_lock);
+	/*
+	 * Keep the page pinned until we return runnable, so that
+	 * another thread cannot skip the break_cow path; this way
+	 * we're sure the pte_same check below also implies that
+	 * the _contents_ of the old_page didn't change under us
+	 * (not only that the pagetable is the same).
+	 *
+	 * Since we have the page_table_lock acquired here, if the
+	 * pte is the same it means we're still holding an additional
+	 * reference on the old_page so we can safely
+	 * page_cache_release(old_page) before the "pte_same == true" path.
+	 */
+	page_cache_release(old_page);
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
 			++mm->rss;
@@ -1001,7 +1010,6 @@
 	}
 	spin_unlock(&mm->page_table_lock);
 	page_cache_release(new_page);
-	page_cache_release(old_page);
 	return 1;	/* Minor fault */
 
 bad_wp_page:
@@ -1154,9 +1162,8 @@
 		ret = 2;
 	}
 
-	mark_page_accessed(page);
-
-	lock_page(page);
+	if (!Page_Uptodate(page))
+		wait_on_page(page);
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1165,7 +1172,6 @@
 	spin_lock(&mm->page_table_lock);
 	if (!pte_same(*page_table, orig_pte)) {
 		spin_unlock(&mm->page_table_lock);
-		unlock_page(page);
 		page_cache_release(page);
 		return 1;
 	}
@@ -1173,14 +1179,15 @@
 	/* The page isn't present yet, go ahead with the fault. */
 		
 	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
-
 	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
-	if (write_access && can_share_swap_page(page))
-		pte = pte_mkdirty(pte_mkwrite(pte));
-	unlock_page(page);
+	if (make_exclusive_page(page, write_access)) {
+		if (write_access)
+			pte = pte_mkdirty(pte);
+		if (vma->vm_flags & VM_WRITE)
+			pte = pte_mkwrite(pte);
+	}
+	mark_page_accessed(page);
 
 	flush_page_to_ram(page);
 	flush_icache_page(vma, page);
@@ -1218,15 +1225,14 @@
 
 		spin_lock(&mm->page_table_lock);
 		if (!pte_none(*page_table)) {
-			page_cache_release(page);
 			spin_unlock(&mm->page_table_lock);
+			page_cache_release(page);
 			return 1;
 		}
 		mm->rss++;
 		flush_page_to_ram(page);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add(page);
-		mark_page_accessed(page);
 	}
 
 	set_pte(page_table, entry);
@@ -1305,9 +1311,9 @@
 			entry = pte_mkwrite(pte_mkdirty(entry));
 		set_pte(page_table, entry);
 	} else {
+		spin_unlock(&mm->page_table_lock);
 		/* One of our sibling threads was faster, back out. */
 		page_cache_release(new_page);
-		spin_unlock(&mm->page_table_lock);
 		return 1;
 	}
 
diff -urN vm-ref/mm/mmap.c vm/mm/mmap.c
--- vm-ref/mm/mmap.c	Sun Mar 10 07:30:37 2002
+++ vm/mm/mmap.c	Sun Mar 10 07:30:58 2002
@@ -70,7 +70,7 @@
 	    return 1;
 
 	/* The page cache contains buffer pages these days.. */
-	free = atomic_read(&page_cache_size);
+	free = page_cache_size;
 	free += nr_free_pages();
 	free += nr_swap_pages;
 
diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c
--- vm-ref/mm/oom_kill.c	Tue Jan 22 18:55:26 2002
+++ vm/mm/oom_kill.c	Sun Mar 10 07:30:58 2002
@@ -150,7 +150,6 @@
 	 * exit() and clear out its resources quickly...
 	 */
 	p->counter = 5 * HZ;
-	p->flags |= PF_MEMALLOC | PF_MEMDIE;
 
 	/* This process has hardware access, be more careful. */
 	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c
--- vm-ref/mm/page_alloc.c	Sun Mar 10 07:30:36 2002
+++ vm/mm/page_alloc.c	Sun Mar 10 07:30:58 2002
@@ -27,8 +27,8 @@
 int nr_swap_pages;
 int nr_active_pages;
 int nr_inactive_pages;
-struct list_head inactive_list;
-struct list_head active_list;
+LIST_HEAD(inactive_list);
+LIST_HEAD(active_list);
 pg_data_t *pgdat_list;
 
 /* Used to look up the address of the struct zone encoded in page->zone */
@@ -39,6 +39,9 @@
 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+
+int vm_gfp_debug = 0;
 
 /*
  * Free_page() adds the page to the free lists. This is optimized for
@@ -101,8 +104,11 @@
 	/* Yes, think what happens when other parts of the kernel take 
 	 * a reference to a page in order to pin it for io. -ben
 	 */
-	if (PageLRU(page))
+	if (PageLRU(page)) {
+		if (unlikely(in_interrupt()))
+			BUG();
 		lru_cache_del(page);
+	}
 
 	if (page->buffers)
 		BUG();
@@ -173,14 +179,14 @@
 	return;
 
  local_freelist:
-	if (current->nr_local_pages)
+	if ((current->local_pages.nr && !current->local_pages.order) ||
+	    !memclass(page_zone(page), current->local_pages.classzone) ||
+	    in_interrupt())
 		goto back_local_freelist;
-	if (in_interrupt())
-		goto back_local_freelist;		
 
-	list_add(&page->list, &current->local_pages);
+	list_add(&page->list, &current->local_pages.list);
 	page->index = order;
-	current->nr_local_pages++;
+	current->local_pages.nr++;
 }
 
 #define MARK_USED(index, order, area) \
@@ -265,35 +271,36 @@
 static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
 {
 	struct page * page = NULL;
-	int __freed = 0;
+	int __freed;
 
-	if (!(gfp_mask & __GFP_WAIT))
-		goto out;
 	if (in_interrupt())
 		BUG();
 
-	current->allocation_order = order;
+	current->local_pages.order = order;
+	current->local_pages.classzone = classzone;
 	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
 
 	__freed = try_to_free_pages(classzone, gfp_mask, order);
 
 	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
 
-	if (current->nr_local_pages) {
+	if (current->local_pages.nr) {
 		struct list_head * entry, * local_pages;
 		struct page * tmp;
 		int nr_pages;
 
-		local_pages = &current->local_pages;
+		local_pages = &current->local_pages.list;
 
 		if (likely(__freed)) {
 			/* pick from the last inserted so we're lifo */
 			entry = local_pages->next;
 			do {
 				tmp = list_entry(entry, struct page, list);
-				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
+				if (!memclass(page_zone(tmp), classzone))
+					BUG();
+				if (tmp->index == order) {
 					list_del(entry);
-					current->nr_local_pages--;
+					current->local_pages.nr--;
 					set_page_count(tmp, 1);
 					page = tmp;
 
@@ -319,7 +326,7 @@
 			} while ((entry = entry->next) != local_pages);
 		}
 
-		nr_pages = current->nr_local_pages;
+		nr_pages = current->local_pages.nr;
 		/* free in reverse order so that the global order will be lifo */
 		while ((entry = local_pages->prev) != local_pages) {
 			list_del(entry);
@@ -328,33 +335,37 @@
 			if (!nr_pages--)
 				BUG();
 		}
-		current->nr_local_pages = 0;
+		current->local_pages.nr = 0;
 	}
- out:
 	*freed = __freed;
 	return page;
 }
 
+static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
+{
+	long free = zone->free_pages - (1UL << order);
+	return free >= 0 ? free : 0;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
 {
-	unsigned long min;
 	zone_t **zone, * classzone;
 	struct page * page;
-	int freed;
+	int freed, class_idx;
 
 	zone = zonelist->zones;
 	classzone = *zone;
-	min = 1UL << order;
+	class_idx = zone_idx(classzone);
+
 	for (;;) {
 		zone_t *z = *(zone++);
 		if (!z)
 			break;
 
-		min += z->pages_low;
-		if (z->free_pages > min) {
+		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
@@ -367,18 +378,16 @@
 		wake_up_interruptible(&kswapd_wait);
 
 	zone = zonelist->zones;
-	min = 1UL << order;
 	for (;;) {
-		unsigned long local_min;
+		unsigned long min;
 		zone_t *z = *(zone++);
 		if (!z)
 			break;
 
-		local_min = z->pages_min;
+		min = z->watermarks[class_idx].min;
 		if (!(gfp_mask & __GFP_WAIT))
-			local_min >>= 2;
-		min += local_min;
-		if (z->free_pages > min) {
+			min >>= 2;
+		if (zone_free_pages(z, order) > min) {
 			page = rmqueue(z, order);
 			if (page)
 				return page;
@@ -387,8 +396,7 @@
 
 	/* here we're in the low on memory slow path */
 
-rebalance:
-	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
+	if (current->flags & PF_MEMALLOC && !in_interrupt()) {
 		zone = zonelist->zones;
 		for (;;) {
 			zone_t *z = *(zone++);
@@ -404,36 +412,51 @@
 
 	/* Atomic allocations - we can't balance anything */
 	if (!(gfp_mask & __GFP_WAIT))
-		return NULL;
+		goto out;
 
+ rebalance:
 	page = balance_classzone(classzone, gfp_mask, order, &freed);
 	if (page)
 		return page;
 
 	zone = zonelist->zones;
-	min = 1UL << order;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
+	if (likely(freed)) {
+		for (;;) {
+			zone_t *z = *(zone++);
+			if (!z)
+				break;
 
-		min += z->pages_min;
-		if (z->free_pages > min) {
-			page = rmqueue(z, order);
-			if (page)
-				return page;
+			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
 		}
-	}
+		goto rebalance;
+	} else {
+		/* 
+		 * Check whether some other task has been killed meanwhile:
+		 * in such a case the allocation can still succeed.
+		 */
+		for (;;) {
+			zone_t *z = *(zone++);
+			if (!z)
+				break;
 
-	/* Don't let big-order allocations loop */
-	if (order > 3)
-		return NULL;
+			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
+				page = rmqueue(z, order);
+				if (page)
+					return page;
+			}
+		}
+	}
 
-	/* Yield for kswapd, and try again */
-	current->policy |= SCHED_YIELD;
-	__set_current_state(TASK_RUNNING);
-	schedule();
-	goto rebalance;
+ out:
+	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
+	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+	if (unlikely(vm_gfp_debug))
+		show_stack(NULL);
+	return NULL;
 }
 
 /*
@@ -547,18 +570,25 @@
 {
 	pg_data_t *pgdat = pgdat_list;
 	unsigned int sum = 0;
+	zonelist_t *zonelist;
+	zone_t **zonep, *zone;
 
 	do {
-		zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
-		zone_t **zonep = zonelist->zones;
-		zone_t *zone;
-
-		for (zone = *zonep++; zone; zone = *zonep++) {
-			unsigned long size = zone->size;
-			unsigned long high = zone->pages_high;
-			if (size > high)
-				sum += size - high;
-		}
+		int class_idx;
+		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
+		zonep = zonelist->zones;
+		zone = *zonep;
+		class_idx = zone_idx(zone);
+
+		sum += zone->nr_cache_pages;
+		do {
+			long free = zone->free_pages - zone->watermarks[class_idx].high; /* signed: negative when below the high watermark */
+			zonep++;
+			zone = *zonep;
+			if (free <= 0)
+				continue; /* nothing to count; the loop test sees the next zone */
+			sum += free;
+		} while (zone);
 
 		pgdat = pgdat->node_next;
 	} while (pgdat);
@@ -580,6 +610,65 @@
 }
 #endif
 
+/*
+ * If it returns non zero it means there's lots of ram "free"
+ * (note: not in cache!) so any caller will know that
+ * he can allocate some memory to do some more aggressive
+ * (possibly wasteful) readahead. The state of the memory
+ * should be rechecked after every few pages allocated for
+ * doing this aggressive readahead.
+ *
+ * The gfp_mask parameter specifies in which kind of memory
+ * the readahead information will be allocated in.
+ */
+int start_aggressive_readahead(unsigned int gfp_mask)
+{
+	pg_data_t *pgdat = pgdat_list;
+	zonelist_t *zonelist;
+	zone_t **zonep, *zone;
+	int ret = 0;
+
+	do {
+		int class_idx;
+		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
+		zonep = zonelist->zones;
+		zone = *(zonep++);
+		class_idx = zone_idx(zone);
+
+		for (; zone; zone = *(zonep++))
+			if (zone->free_pages > zone->watermarks[class_idx].high * 2)
+				ret = 1;
+
+		pgdat = pgdat->node_next;
+	} while (pgdat);
+
+	return ret;
+}
+
+int try_to_free_pages_nozone(unsigned int gfp_mask)
+{
+	pg_data_t *pgdat = pgdat_list;
+	zonelist_t *zonelist;
+	zone_t **zonep;
+	int ret = 0;
+	unsigned long pf_free_pages;
+
+	pf_free_pages = current->flags & PF_FREE_PAGES;
+	current->flags &= ~PF_FREE_PAGES;
+
+	do {
+		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
+		zonep = zonelist->zones;
+
+		ret |= try_to_free_pages(*zonep, gfp_mask, 0);
+
+		pgdat = pgdat->node_next;
+	} while (pgdat);
+
+	current->flags |= pf_free_pages;
+	return ret;
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
 /*
@@ -601,13 +690,9 @@
 		zone_t *zone;
 		for (zone = tmpdat->node_zones;
 			       	zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
-			printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB " 
-				       "high:%6lukB\n", 
+			printk("Zone:%s freepages:%6lukB\n", 
 					zone->name,
-					K(zone->free_pages),
-					K(zone->pages_min),
-					K(zone->pages_low),
-					K(zone->pages_high));
+					K(zone->free_pages));
 			
 		tmpdat = tmpdat->node_next;
 	}
@@ -714,33 +799,45 @@
  */
 #define PAGES_PER_WAITQUEUE	256
 
-static inline unsigned long wait_table_size(unsigned long pages)
+static inline unsigned long wait_table_size(unsigned long pages, unsigned long * shift)
 {
 	unsigned long size = 1;
+	unsigned long __shift = 0;
 
 	pages /= PAGES_PER_WAITQUEUE;
 
-	while (size < pages)
+	while (size < pages) {
 		size <<= 1;
+		__shift++;
+	}
 
 	/*
-	 * Once we have dozens or even hundreds of threads sleeping
-	 * on IO we've got bigger problems than wait queue collision.
-	 * Limit the size of the wait table to a reasonable size.
+	 * The usage pattern of the queues depends mostly on the I/O,
+	 * not much of the ram size of the machine, so make sure the
+	 * array is large enough on lowmem nodes too.
 	 */
-	size = min(size, 4096UL);
+	size = max(size, 256UL);	/* UL: both max() operands must have the same type */
+	*shift = max(__shift, 8UL);	/* 8 == log2(256), keeps shift in sync with size */
 
 	return size;
 }
 
 /*
- * This is an integer logarithm so that shifts can be used later
- * to extract the more random high bits from the multiplicative
- * hash function before the remainder is taken.
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per node.
  */
-static inline unsigned long wait_table_bits(unsigned long size)
+static inline void wait_table_init(pg_data_t *pgdat)
 {
-	return ffz(~size);
+	unsigned long shift, size, i;
+
+	size = wait_table_size(pgdat->node_size, &shift);
+
+	pgdat->wait_table.size = size;
+	pgdat->wait_table.shift = shift;
+	pgdat->wait_table.head = (wait_queue_head_t *) alloc_bootmem_node(pgdat, size * sizeof(wait_queue_head_t));
+
+	for(i = 0; i < size; i++)
+		init_waitqueue_head(pgdat->wait_table.head + i);
 }
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
@@ -775,9 +872,6 @@
 			
 	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
 
-	INIT_LIST_HEAD(&active_list);
-	INIT_LIST_HEAD(&inactive_list);
-
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
 	 * maps) have to search for a good mem_map area:
@@ -797,11 +891,14 @@
 	pgdat->node_start_mapnr = (lmem_map - mem_map);
 	pgdat->nr_zones = 0;
 
+	wait_table_init(pgdat);
+
 	offset = lmem_map - mem_map;	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
 		unsigned long mask;
 		unsigned long size, realsize;
+		int idx;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
 		realsize = size = zones_size[j];
@@ -810,28 +907,16 @@
 
 		printk("zone(%lu): %lu pages.\n", j, size);
 		zone->size = size;
+		zone->realsize = realsize;
 		zone->name = zone_names[j];
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 		zone->need_balance = 0;
+		zone->nr_active_pages = zone->nr_inactive_pages = 0;
 		if (!size)
 			continue;
 
-		/*
-		 * The per-page waitqueue mechanism uses hashed waitqueues
-		 * per zone.
-		 */
-		zone->wait_table_size = wait_table_size(size);
-		zone->wait_table_shift =
-			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
-		zone->wait_table = (wait_queue_head_t *)
-			alloc_bootmem_node(pgdat, zone->wait_table_size
-						* sizeof(wait_queue_head_t));
-
-		for(i = 0; i < zone->wait_table_size; ++i)
-			init_waitqueue_head(zone->wait_table + i);
-
 		pgdat->nr_zones = j+1;
 
 		mask = (realsize / zone_balance_ratio[j]);
@@ -839,9 +924,29 @@
 			mask = zone_balance_min[j];
 		else if (mask > zone_balance_max[j])
 			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
+		zone->watermarks[j].min = mask;
+		zone->watermarks[j].low = mask*2;
+		zone->watermarks[j].high = mask*3;
+		/* now set the watermarks of the lower zones in the "j" classzone */
+		for (idx = j-1; idx >= 0; idx--) {
+			zone_t * lower_zone = pgdat->node_zones + idx;
+			unsigned long lower_zone_reserve;
+			if (!lower_zone->size)
+				continue;
+
+			mask = lower_zone->watermarks[idx].min;
+			lower_zone->watermarks[j].min = mask;
+			lower_zone->watermarks[j].low = mask*2;
+			lower_zone->watermarks[j].high = mask*3;
+
+			/* now the brainer part */
+			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
+			lower_zone->watermarks[j].min += lower_zone_reserve;
+			lower_zone->watermarks[j].low += lower_zone_reserve;
+			lower_zone->watermarks[j].high += lower_zone_reserve;
+
+			realsize += lower_zone->realsize;
+		}
 
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
@@ -925,3 +1030,16 @@
 }
 
 __setup("memfrac=", setup_mem_frac);
+
+static int __init setup_lower_zone_reserve(char *str)
+{
+	int j = 0;	/* next lower_zone_reserve_ratio[] slot; bounded below so extra values cannot overflow the array */
+
+	while (j < MAX_NR_ZONES-1 && get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
+	printk("setup_lower_zone_reserve: ");
+	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
+	printk("\n");
+	return 1;
+}
+
+__setup("lower_zone_reserve=", setup_lower_zone_reserve);
diff -urN vm-ref/mm/page_io.c vm/mm/page_io.c
--- vm-ref/mm/page_io.c	Tue Jan 22 18:56:00 2002
+++ vm/mm/page_io.c	Sun Mar 10 07:30:58 2002
@@ -73,10 +73,6 @@
  	/* block_size == PAGE_SIZE/zones_used */
  	brw_page(rw, page, dev, zones, block_size);
 
- 	/* Note! For consistency we do all of the logic,
- 	 * decrementing the page count, and unlocking the page in the
- 	 * swap lock map - in the IO completion handler.
- 	 */
 	return 1;
 }
 
diff -urN vm-ref/mm/slab.c vm/mm/slab.c
--- vm-ref/mm/slab.c	Tue Jan 22 18:56:30 2002
+++ vm/mm/slab.c	Sun Mar 10 07:30:58 2002
@@ -916,8 +916,6 @@
 	slab_t *slabp;
 	int ret;
 
-	drain_cpu_caches(cachep);
-
 	spin_lock_irq(&cachep->spinlock);
 
 	/* If the cache is growing, stop shrinking. */
@@ -987,6 +985,8 @@
 						kmem_cache_t, next);
 	list_del(&cachep->next);
 	up(&cache_chain_sem);
+
+	drain_cpu_caches(cachep);
 
 	if (__kmem_cache_shrink(cachep)) {
 		printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n",
diff -urN vm-ref/mm/swap.c vm/mm/swap.c
--- vm-ref/mm/swap.c	Tue Jan 22 18:56:00 2002
+++ vm/mm/swap.c	Sun Mar 10 07:30:58 2002
@@ -36,18 +36,17 @@
 /*
  * Move an inactive page to the active list.
  */
-static inline void activate_page_nolock(struct page * page)
-{
-	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(page);
-		add_page_to_active_list(page);
-	}
-}
-
 void activate_page(struct page * page)
 {
 	spin_lock(&pagemap_lru_lock);
-	activate_page_nolock(page);
+	if (PageLRU(page)) {
+		if  (!PageActive(page)) {
+			del_page_from_inactive_list(page);
+			add_page_to_active_list(page);
+			ClearPageReferenced(page);
+		} else
+			SetPageReferenced(page);
+	}
 	spin_unlock(&pagemap_lru_lock);
 }
 
diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c
--- vm-ref/mm/swap_state.c	Tue Jan 22 18:55:27 2002
+++ vm/mm/swap_state.c	Sun Mar 10 07:30:58 2002
@@ -117,7 +117,9 @@
 	if (!PageLocked(page))
 		BUG();
 
-	block_flushpage(page, 0);
+	if (!block_flushpage(page, 0))
+		/* an anonymous page cannot have page->buffers set */
+		BUG();
 
 	entry.val = page->index;
 
diff -urN vm-ref/mm/swapfile.c vm/mm/swapfile.c
--- vm-ref/mm/swapfile.c	Mon Feb 25 22:05:09 2002
+++ vm/mm/swapfile.c	Sun Mar 10 07:30:58 2002
@@ -227,6 +227,7 @@
  * Check if we're the only user of a swap page,
  * when the page is locked.
  */
+static int FASTCALL(exclusive_swap_page(struct page *page));
 static int exclusive_swap_page(struct page *page)
 {
 	int retval = 0;
@@ -240,12 +241,13 @@
 		if (p->swap_map[SWP_OFFSET(entry)] == 1) {
 			/* Recheck the page count with the pagecache lock held.. */
 			spin_lock(&pagecache_lock);
-			if (page_count(page) - !!page->buffers == 2)
+			if (PageSwapCache(page) && page_count(page) - !!page->buffers == 2)
 				retval = 1;
 			spin_unlock(&pagecache_lock);
 		}
 		swap_info_put(p);
 	}
+
 	return retval;
 }
 
@@ -257,21 +259,42 @@
  * work, but we opportunistically check whether
  * we need to get all the locks first..
  */
-int can_share_swap_page(struct page *page)
+int make_exclusive_page(struct page *page, int write)
 {
 	int retval = 0;
 
-	if (!PageLocked(page))
-		BUG();
 	switch (page_count(page)) {
 	case 3:
 		if (!page->buffers)
 			break;
 		/* Fallthrough */
 	case 2:
+		/* racy fastpath check */
 		if (!PageSwapCache(page))
 			break;
-		retval = exclusive_swap_page(page);
+
+		if ((!write && !vm_swap_full()) || TryLockPage(page)) {
+			/*
+			 * Don't remove the page from the swapcache if:
+			 * - it was a read fault and...
+			 * - the swap isn't full
+			 * or if
+			 * - we failed acquiring the page lock
+			 *
+			 * NOTE: if failed acquiring the lock we cannot remove the
+			 * page from the swapcache, but still we can safely takeover
+			 * the page if it's exclusive, see the swapcache check in
+			 * the innermost critical section of exclusive_swap_page().
+			 */
+			retval = exclusive_swap_page(page);
+		} else {
+			/*
+			 * Here we've the page lock acquired and we're asked
+			 * to try to drop this page from the swapcache.
+			 */
+			retval = remove_exclusive_swap_page(page);
+			unlock_page(page);
+		}
 		break;
 	case 1:
 		if (PageReserved(page))
@@ -300,7 +323,7 @@
 
 	entry.val = page->index;
 	p = swap_info_get(entry);
-	if (!p)
+	if (unlikely(!p))
 		return 0;
 
 	/* Is the only swap cache user the cache itself? */
@@ -309,7 +332,11 @@
 		/* Recheck the page count with the pagecache lock held.. */
 		spin_lock(&pagecache_lock);
 		if (page_count(page) - !!page->buffers == 2) {
+			if (page->buffers && !try_to_free_buffers(page, 0))
+				/* an anonymous page cannot have page->buffers set */
+				BUG();
 			__delete_from_swap_cache(page);
+			swap_entry_free(p, SWP_OFFSET(entry));
 			SetPageDirty(page);
 			retval = 1;
 		}
@@ -317,11 +344,8 @@
 	}
 	swap_info_put(p);
 
-	if (retval) {
-		block_flushpage(page, 0);
-		swap_free(entry);
+	if (retval)
 		page_cache_release(page);
-	}
 
 	return retval;
 }
@@ -343,11 +367,7 @@
 	}
 	if (page) {
 		page_cache_get(page);
-		/* Only cache user (+us), or swap space full? Free it! */
-		if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) {
-			delete_from_swap_cache(page);
-			SetPageDirty(page);
-		}
+		remove_exclusive_swap_page(page);
 		UnlockPage(page);
 		page_cache_release(page);
 	}
diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c
--- vm-ref/mm/vmscan.c	Fri Mar  1 00:09:39 2002
+++ vm/mm/vmscan.c	Sun Mar 10 07:31:06 2002
@@ -25,12 +25,41 @@
 #include <asm/pgalloc.h>
 
 /*
- * The "priority" of VM scanning is how much of the queues we
- * will scan in one go. A value of 6 for DEF_PRIORITY implies
- * that we'll scan 1/64th of the queues ("queue_length >> 6")
- * during a normal aging round.
+ * "vm_passes" is the number of vm passes before failing the
+ * memory balancing. Take into account 3 passes are needed
+ * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
+ * of the inactive list at each pass.
  */
-#define DEF_PRIORITY (6)
+int vm_passes = 60;
+
+/*
+ * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
+ * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
+ * scan 1/6 of the inactive lists during a normal aging round.
+ */
+int vm_cache_scan_ratio = 6;
+
+/*
+ * "vm_mapped_ratio" controls the pageout rate, the smaller, the earlier
+ * we'll start to pageout.
+ */
+int vm_mapped_ratio = 100;
+
+/*
+ * "vm_lru_balance_ratio" controls the balance between active and
+ * inactive cache. The bigger vm_lru_balance_ratio is, the easier
+ * the active cache will grow, because we'll rotate the active list
+ * slowly. A value of 2 means we'll go towards a balance of
+ * 1/3 of the cache being inactive.
+ */
+int vm_lru_balance_ratio = 2;
+
+/*
+ * "vm_vfs_scan_ratio" is how much of the VFS queues we will scan
+ * in one go. A value of 6 for vm_vfs_scan_ratio implies that we'll
+ * scan 1/6 of the VFS queues during a normal aging round.
+ */
+int vm_vfs_scan_ratio = 6;
 
 /*
  * The swap-out function returns 1 if it successfully
@@ -53,10 +82,6 @@
 		return 0;
 	}
 
-	/* Don't bother unmapping pages that are active */
-	if (PageActive(page))
-		return 0;
-
 	/* Don't bother replenishing zones not under pressure.. */
 	if (!memclass(page_zone(page), classzone))
 		return 0;
@@ -256,6 +281,7 @@
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
+	int tlb_flush = 0;
 
 	/*
 	 * Find the proper vm-area after freezing the vma chain 
@@ -270,6 +296,7 @@
 	}
 	vma = find_vma(mm, address);
 	if (vma) {
+		tlb_flush = 1;
 		if (address < vma->vm_start)
 			address = vma->vm_start;
 
@@ -288,16 +315,18 @@
 
 out_unlock:
 	spin_unlock(&mm->page_table_lock);
+	if (tlb_flush)
+		flush_tlb_mm(mm);
 	return count;
 }
 
-static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
-static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
+static int FASTCALL(swap_out(zone_t * classzone));
+static int swap_out(zone_t * classzone)
 {
 	int counter, nr_pages = SWAP_CLUSTER_MAX;
 	struct mm_struct *mm;
 
-	counter = mmlist_nr;
+	counter = mmlist_nr << 1;
 	do {
 		if (unlikely(current->need_resched)) {
 			__set_current_state(TASK_RUNNING);
@@ -326,23 +355,44 @@
 			return 1;
 	} while (--counter >= 0);
 
+ out:
+	if (unlikely(vm_gfp_debug)) {
+		printk(KERN_NOTICE "swap_out: failed\n");
+		show_stack(NULL);
+	}
 	return 0;
 
-empty:
+ empty:
 	spin_unlock(&mmlist_lock);
+	goto out;
+}
+
+static int FASTCALL(memclass_related_bhs(struct page * page, zone_t * classzone));
+static int memclass_related_bhs(struct page * page, zone_t * classzone)
+{
+	struct buffer_head * tmp, * bh = page->buffers;
+
+	tmp = bh;
+	do {
+		if (memclass(page_zone(virt_to_page(tmp)), classzone))
+			return 1;
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+
 	return 0;
 }
 
-static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
-static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
+static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
+static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
+static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
 {
 	struct list_head * entry;
-	int max_scan = nr_inactive_pages / priority;
-	int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);
+	int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
+	int max_mapped = vm_mapped_ratio * nr_pages;
 
-	spin_lock(&pagemap_lru_lock);
-	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
+	while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
 		struct page * page;
+		int only_metadata;
 
 		if (unlikely(current->need_resched)) {
 			spin_unlock(&pagemap_lru_lock);
@@ -369,8 +419,30 @@
 		if (unlikely(!page_count(page)))
 			continue;
 
-		if (!memclass(page_zone(page), classzone))
+		only_metadata = 0;
+		if (!memclass(page_zone(page), classzone)) {
+			/*
+			 * Hack to address an issue found by Rik. The problem is that
+			 * highmem pages can hold buffer headers allocated
+			 * from the slab on lowmem, and so if we are working
+			 * on the NORMAL classzone here, it is correct not to
+			 * try to free the highmem pages themselves (that would be useless)
+			 * but we must make sure to drop any lowmem metadata related to those
+			 * highmem pages.
+			 */
+			if (page->buffers && page->mapping) { /* fast path racy check */
+				if (unlikely(TryLockPage(page)))
+					continue;
+				if (page->buffers && page->mapping && memclass_related_bhs(page, classzone)) { /* non racy check */
+					only_metadata = 1;
+					goto free_bhs;
+				}
+				UnlockPage(page);
+			}
 			continue;
+		}
+
+		max_scan--;
 
 		/* Racy check to avoid trylocking when not worthwhile */
 		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
@@ -423,6 +495,7 @@
 		 * the page as well.
 		 */
 		if (page->buffers) {
+		free_bhs:
 			spin_unlock(&pagemap_lru_lock);
 
 			/* avoid to free a locked page */
@@ -455,6 +528,10 @@
 					page_cache_release(page);
 
 					spin_lock(&pagemap_lru_lock);
+					if (only_metadata) {
+						UnlockPage(page);
+						continue;
+					}
 				}
 			} else {
 				/* failed to drop the buffers so stop here */
@@ -469,34 +546,49 @@
 		spin_lock(&pagecache_lock);
 
 		/*
-		 * this is the non-racy check for busy page.
+		 * This is the non-racy check for busy page.
+		 * It is critical to check PageDirty _after_ we made sure
+		 * the page is freeable so not in use by anybody.
+		 * At this point we're guaranteed that page->buffers is NULL,
+		 * nobody can refill page->buffers under us because we still
+		 * hold the page lock.
 		 */
-		if (!page->mapping || !is_page_cache_freeable(page)) {
+		if (!page->mapping || page_count(page) > 1) {
 			spin_unlock(&pagecache_lock);
 			UnlockPage(page);
-page_mapped:
-			if (--max_mapped >= 0)
-				continue;
+		page_mapped:
+			if (--max_mapped < 0) {
+				spin_unlock(&pagemap_lru_lock);
 
-			/*
-			 * Alert! We've found too many mapped pages on the
-			 * inactive list, so we start swapping out now!
-			 */
-			spin_unlock(&pagemap_lru_lock);
-			swap_out(priority, gfp_mask, classzone);
-			return nr_pages;
-		}
+				nr_pages -= kmem_cache_reap(gfp_mask);
+				if (nr_pages <= 0)
+					goto out;
 
-		/*
-		 * It is critical to check PageDirty _after_ we made sure
-		 * the page is freeable* so not in use by anybody.
-		 */
+				shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
+				shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
+#ifdef CONFIG_QUOTA
+				shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
+#endif
+
+				if (!*failed_swapout)
+					*failed_swapout = !swap_out(classzone);
+
+				max_mapped = nr_pages * vm_mapped_ratio;
+
+				spin_lock(&pagemap_lru_lock);
+				refill_inactive(nr_pages, classzone);
+			}
+			continue;
+			
+		}
 		if (PageDirty(page)) {
 			spin_unlock(&pagecache_lock);
 			UnlockPage(page);
 			continue;
 		}
 
+		__lru_cache_del(page);
+
 		/* point of no return */
 		if (likely(!PageSwapCache(page))) {
 			__remove_inode_page(page);
@@ -509,7 +601,6 @@
 			swap_free(swap);
 		}
 
-		__lru_cache_del(page);
 		UnlockPage(page);
 
 		/* effectively free the page here */
@@ -521,6 +612,7 @@
 	}
 	spin_unlock(&pagemap_lru_lock);
 
+ out:
 	return nr_pages;
 }
 
@@ -531,77 +623,112 @@
  * We move them the other way when we see the
  * reference bit on the page.
  */
-static void refill_inactive(int nr_pages)
+static void refill_inactive(int nr_pages, zone_t * classzone)
 {
 	struct list_head * entry;
+	unsigned long ratio;
+
+	ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);
 
-	spin_lock(&pagemap_lru_lock);
 	entry = active_list.prev;
-	while (nr_pages && entry != &active_list) {
+	while (ratio && entry != &active_list) {
 		struct page * page;
+		int related_metadata = 0;
 
 		page = list_entry(entry, struct page, lru);
 		entry = entry->prev;
+
+		if (!memclass(page_zone(page), classzone)) {
+			/*
+			 * Hack to address an issue found by Rik. The problem is that
+			 * highmem pages can hold buffer heads allocated
+			 * from the slab on lowmem, and so if we are working
+			 * on the NORMAL classzone here, it is correct not to
+			 * try to free the highmem pages themselves (that would be useless),
+			 * but we must make sure to drop any lowmem metadata related to those
+			 * highmem pages.
+			 */
+			if (page->buffers && page->mapping) { /* fast path racy check */
+				if (unlikely(TryLockPage(page)))
+					continue;
+				if (page->buffers && page->mapping && memclass_related_bhs(page, classzone)) /* non racy check */
+					related_metadata = 1;
+				UnlockPage(page);
+			}
+			if (!related_metadata)
+				continue;
+		}
+
 		if (PageTestandClearReferenced(page)) {
 			list_del(&page->lru);
 			list_add(&page->lru, &active_list);
 			continue;
 		}
 
-		nr_pages--;
+		if (!related_metadata)
+			ratio--;
 
 		del_page_from_active_list(page);
 		add_page_to_inactive_list(page);
 		SetPageReferenced(page);
 	}
-	spin_unlock(&pagemap_lru_lock);
+	if (entry != &active_list) {
+		list_del(&active_list);
+		list_add(&active_list, entry);
+	}
 }
 
-static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
+static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
+static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
 {
-	int chunk_size = nr_pages;
-	unsigned long ratio;
-
 	nr_pages -= kmem_cache_reap(gfp_mask);
 	if (nr_pages <= 0)
-		return 0;
+		goto out;
 
-	nr_pages = chunk_size;
-	/* try to keep the active list 2/3 of the size of the cache */
-	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
-	refill_inactive(ratio);
-
-	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
-	if (nr_pages <= 0)
-		return 0;
+	spin_lock(&pagemap_lru_lock);
+	refill_inactive(nr_pages, classzone);
 
-	shrink_dcache_memory(priority, gfp_mask);
-	shrink_icache_memory(priority, gfp_mask);
-#ifdef CONFIG_QUOTA
-	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
-#endif
+	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);
 
+ out:
 	return nr_pages;
 }
 
+static int check_classzone_need_balance(zone_t * classzone);
+
 int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
 {
-	int priority = DEF_PRIORITY;
-	int nr_pages = SWAP_CLUSTER_MAX;
-
 	gfp_mask = pf_gfp_mask(gfp_mask);
-	do {
-		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
-		if (nr_pages <= 0)
-			return 1;
-	} while (--priority);
 
-	/*
-	 * Hmm.. Cache shrink failed - time to kill something?
-	 * Mhwahahhaha! This is the part I really like. Giggle.
-	 */
-	out_of_memory();
+	for (;;) {
+		int tries = vm_passes;
+		int failed_swapout = !(gfp_mask & __GFP_IO);
+		int nr_pages = SWAP_CLUSTER_MAX;
+
+		do {
+			nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
+			if (nr_pages <= 0)
+				return 1;
+
+			shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
+			shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
+#ifdef CONFIG_QUOTA
+			shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
+#endif
+
+			if (!failed_swapout)
+				failed_swapout = !swap_out(classzone);
+		} while (--tries);
+
+		if (likely(current->pid != 1))
+			break;
+		if (!check_classzone_need_balance(classzone))
+			break;
+		current->policy |= SCHED_YIELD;
+		__set_current_state(TASK_RUNNING);
+		schedule();
+	}
+
 	return 0;
 }
 
@@ -609,11 +736,12 @@
 
 static int check_classzone_need_balance(zone_t * classzone)
 {
-	zone_t * first_classzone;
+	zone_t * first_zone;
+	int class_idx = zone_idx(classzone);
 
-	first_classzone = classzone->zone_pgdat->node_zones;
-	while (classzone >= first_classzone) {
-		if (classzone->free_pages > classzone->pages_high)
+	first_zone = classzone->zone_pgdat->node_zones;
+	while (classzone >= first_zone) {
+		if (classzone->free_pages > classzone->watermarks[class_idx].high)
 			return 0;
 		classzone--;
 	}
@@ -629,12 +757,12 @@
 		zone = pgdat->node_zones + i;
 		if (unlikely(current->need_resched))
 			schedule();
-		if (!zone->need_balance)
+		if (!zone->need_balance || !zone->size)
 			continue;
 		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
 			zone->need_balance = 0;
 			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ);
+			schedule_timeout(HZ*5);
 			continue;
 		}
 		if (check_classzone_need_balance(zone))
@@ -667,7 +795,7 @@
 
 	for (i = pgdat->nr_zones-1; i >= 0; i--) {
 		zone = pgdat->node_zones + i;
-		if (!zone->need_balance)
+		if (!zone->need_balance || !zone->size)
 			continue;
 		return 0;
 	}