diff -urN 2.2.17pre9/drivers/block/rd.c 2.2.17pre9-VM/drivers/block/rd.c
--- 2.2.17pre9/drivers/block/rd.c	Wed Jun 28 17:13:02 2000
+++ 2.2.17pre9-VM/drivers/block/rd.c	Sun Jul 2 23:56:02 2000
@@ -173,7 +173,7 @@
 		if (CURRENT->cmd == READ)
 			memset(CURRENT->buffer, 0, len);
 		else
-			set_bit(BH_Protected, &CURRENT->bh->b_state);
+			mark_buffer_protected(CURRENT->bh);
 
 		end_request(1);
 		goto repeat;
diff -urN 2.2.17pre9/fs/buffer.c 2.2.17pre9-VM/fs/buffer.c
--- 2.2.17pre9/fs/buffer.c	Wed Jun 28 17:13:08 2000
+++ 2.2.17pre9-VM/fs/buffer.c	Mon Jul 3 11:29:32 2000
@@ -27,6 +27,8 @@
 /* invalidate_buffers/set_blocksize/sync_dev race conditions and
    fs corruption fixes, 1999, Andrea Arcangeli */
 
+/* async buffer flushing, 1999 Andrea Arcangeli */
+
 /* Wait for dirty buffers to sync in sync_page_buffers.
  * 2000, Marcelo Tosatti */
 
@@ -83,6 +85,7 @@
 
 static int nr_buffers = 0;
 static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST];
 static int nr_buffer_heads = 0;
 static int nr_unused_buffer_heads = 0;
 static int nr_hashed_buffers = 0;
@@ -474,6 +477,7 @@
 		return;
 	}
 	nr_buffers_type[bh->b_list]--;
+	size_buffers_type[bh->b_list] -= bh->b_size;
 	remove_from_hash_queue(bh);
 	remove_from_lru_list(bh);
 }
@@ -523,6 +527,7 @@
 	(*bhp)->b_prev_free = bh;
 
 	nr_buffers_type[bh->b_list]++;
+	size_buffers_type[bh->b_list] += bh->b_size;
 
 	/* Put the buffer in new hash-queue if it has a device. */
 	bh->b_next = NULL;
@@ -816,6 +821,46 @@
 	insert_into_queues(bh);
 }
 
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
+	tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
+
+	dirty *= 200;
+	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+	hard_dirty_limit = soft_dirty_limit * 2;
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * if a new dirty buffer is created we need to balance bdflush.
+ *
+ * in the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+	int state = balance_dirty_state(dev);
+
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
+}
+
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
@@ -828,7 +873,9 @@
 		printk("Attempt to refile free buffer\n");
 		return;
 	}
-	if (buffer_dirty(buf))
+	if (buffer_protected(buf))
+		dispose = BUF_PROTECTED;
+	else if (buffer_dirty(buf))
 		dispose = BUF_DIRTY;
 	else if (buffer_locked(buf))
 		dispose = BUF_LOCKED;
@@ -837,13 +884,7 @@
 	if(dispose != buf->b_list) {
 		file_buffer(buf, dispose);
 		if(dispose == BUF_DIRTY) {
-			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-			/* This buffer is dirty, maybe we need to start flushing.
-			 * If too high a percentage of the buffers are dirty...
-			 */
-			if (nr_buffers_type[BUF_DIRTY] > too_many)
-				wakeup_bdflush(1);
+			balance_dirty(buf->b_dev);
 
 			/* If this is a loop device, and
 			 * more than half of the buffers are dirty...
@@ -1468,18 +1509,23 @@
 #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
 #define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
 
-static int sync_page_buffers(struct buffer_head *bh, int wait)
+static int sync_page_buffers(struct buffer_head *bh)
 {
 	struct buffer_head * tmp = bh;
 
 	do {
 		struct buffer_head *p = tmp;
 		tmp = tmp->b_this_page;
-		if (buffer_locked(p)) {
-			if (wait)
-				__wait_on_buffer(p);
-		} else if (buffer_dirty(p))
-			ll_rw_block(WRITE, 1, &p);
+		if (buffer_dirty(p) || buffer_locked(p)) {
+			if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
+				if (buffer_dirty(p))
+					ll_rw_block(WRITE, 1, &p);
+				wait_on_buffer(p);
+			} else {
+				if (buffer_dirty(p))
+					ll_rw_block(WRITEA, 1, &p);
+			}
+		}
 	} while (tmp != bh);
 
 	do {
@@ -1499,10 +1545,9 @@
  * Wake up bdflush() if this fails - if we're running low on memory due
  * to dirty buffers, we need to flush them out as quickly as possible.
  */
-int try_to_free_buffers(struct page * page_map, int wait)
+int try_to_free_buffers(struct page * page_map)
 {
 	struct buffer_head * tmp, * bh = page_map->buffers;
-	int too_many;
 
 	tmp = bh;
 	do {
@@ -1531,25 +1576,14 @@
 	return 1;
 
 busy:
-	too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-	if (!sync_page_buffers(bh, wait)) {
-
-		/* If a high percentage of the buffers are dirty,
-		 * wake kflushd
-		 */
-		if (nr_buffers_type[BUF_DIRTY] > too_many)
-			wakeup_bdflush(0);
-
+	if (!sync_page_buffers(bh))
 		/*
 		 * We can jump after the busy check because
 		 * we rely on the kernel lock.
 		 */
 		goto succeed;
-	}
 
-	if(nr_buffers_type[BUF_DIRTY] > too_many)
-		wakeup_bdflush(0);
+	wakeup_bdflush(0);
 
 	return 0;
 }
@@ -1561,7 +1595,7 @@
 	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
 	int protected = 0;
 	int nlist;
-	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
+	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",};
 
 	printk("Buffer memory: %8ldkB\n",buffermem>>10);
 	printk("Buffer heads: %6d\n",nr_buffer_heads);
@@ -1585,7 +1619,7 @@
 			used++, lastused = found;
 		bh = bh->b_next_free;
 	} while (bh != lru_list[nlist]);
-	printk("%8s: %d buffers, %d used (last=%d), "
+	printk("%9s: %d buffers, %d used (last=%d), "
 	       "%d locked, %d protected, %d dirty\n",
 	       buf_types[nlist], found, used, lastused,
 	       locked, protected, dirty);
@@ -1930,7 +1964,8 @@
 
 		/* If there are still a lot of dirty buffers around,
 		 * skip the sleep and flush some more */
-		if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
+		if (!ndirty || balance_dirty_state(NODEV) < 0)
+		{
 			spin_lock_irq(&current->sigmask_lock);
 			flush_signals(current);
 			spin_unlock_irq(&current->sigmask_lock);
diff -urN 2.2.17pre9/fs/dcache.c 2.2.17pre9-VM/fs/dcache.c
--- 2.2.17pre9/fs/dcache.c	Tue Jun 13 03:48:14 2000
+++ 2.2.17pre9-VM/fs/dcache.c	Sun Jul 2 23:57:23 2000
@@ -477,7 +477,7 @@
 {
 	if (gfp_mask & __GFP_IO) {
 		int count = 0;
-		if (priority)
+		if (priority > 1)
 			count = dentry_stat.nr_unused / priority;
 		prune_dcache(count, -1);
 	}
diff -urN 2.2.17pre9/fs/ext2/super.c 2.2.17pre9-VM/fs/ext2/super.c
--- 2.2.17pre9/fs/ext2/super.c	Mon Jan 17 16:44:42 2000
+++ 2.2.17pre9-VM/fs/ext2/super.c	Mon Jul 3 15:10:43 2000
@@ -589,7 +589,7 @@
 		EXT2_BLOCKS_PER_GROUP(sb);
 	db_count = (sb->u.ext2_sb.s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
 		   EXT2_DESC_PER_BLOCK(sb);
-	sb->u.ext2_sb.s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
+	sb->u.ext2_sb.s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_BUFFER);
 	if (sb->u.ext2_sb.s_group_desc == NULL) {
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
diff -urN 2.2.17pre9/include/linux/fs.h 2.2.17pre9-VM/include/linux/fs.h
--- 2.2.17pre9/include/linux/fs.h	Fri Jun 30 04:03:09 2000
+++ 2.2.17pre9-VM/include/linux/fs.h	Mon Jul 3 04:45:55 2000
@@ -185,6 +185,7 @@
 #define BH_Lock		2	/* 1 if the buffer is locked */
 #define BH_Req		3	/* 0 if the buffer has been invalidated */
 #define BH_Protected	6	/* 1 if the buffer is protected */
+#define BH_Wait_IO	7	/* 1 if we should throttle on this buffer */
 
 /*
  * Try to keep the most commonly used fields in single cache lines (16
@@ -754,7 +755,7 @@
 extern void refile_buffer(struct buffer_head * buf);
 extern void set_writetime(struct buffer_head * buf, int flag);
 
-extern int try_to_free_buffers(struct page *, int wait);
+extern int try_to_free_buffers(struct page *);
 
 extern int nr_buffers;
 extern long buffermem;
@@ -763,9 +764,18 @@
 #define BUF_CLEAN	0
 #define BUF_LOCKED	1	/* Buffers scheduled for write */
 #define BUF_DIRTY	2	/* Dirty buffers, not yet scheduled for write */
-#define NR_LIST		3
+#define BUF_PROTECTED	3	/* Ramdisk persistent storage */
+#define NR_LIST		4
 
 void mark_buffer_uptodate(struct buffer_head * bh, int on);
+
+extern inline void mark_buffer_protected(struct buffer_head * bh)
+{
+	if (!test_and_set_bit(BH_Protected, &bh->b_state)) {
+		if (bh->b_list != BUF_PROTECTED)
+			refile_buffer(bh);
+	}
+}
 
 extern inline void mark_buffer_clean(struct buffer_head * bh)
 {
diff -urN 2.2.17pre9/include/linux/locks.h 2.2.17pre9-VM/include/linux/locks.h
--- 2.2.17pre9/include/linux/locks.h	Sun Jul 2 12:52:47 2000
+++ 2.2.17pre9-VM/include/linux/locks.h	Mon Jul 3 04:15:36 2000
@@ -29,6 +29,7 @@
 extern inline void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit(BH_Lock, &bh->b_state);
+	clear_bit(BH_Wait_IO, &bh->b_state);
 	wake_up(&bh->b_wait);
 }
 
diff -urN 2.2.17pre9/include/linux/sched.h 2.2.17pre9-VM/include/linux/sched.h
--- 2.2.17pre9/include/linux/sched.h	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/include/linux/sched.h	Sun Jul 2 23:45:13 2000
@@ -316,6 +316,7 @@
 	struct files_struct *files;
 /* memory management info */
 	struct mm_struct *mm;
+	struct list_head local_pages; int allocation_order, nr_local_pages;
 /* signal handlers */
 	spinlock_t sigmask_lock;	/* Protects signal and blocked */
@@ -348,6 +349,7 @@
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_VFORK	0x00001000	/* Wake up parent in mm_release */
+#define PF_FREE_PAGES	0x00002000	/* The current-> */
 
 #define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
 #define PF_DTRACE	0x00200000	/* delayed trace (used on m68k, i386) */
@@ -395,7 +397,7 @@
 /* tss */	INIT_TSS, \
 /* fs */	&init_fs, \
 /* files */	&init_files, \
-/* mm */	&init_mm, \
+/* mm */	&init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, \
 /* signals */	SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \
 /* exec cts */	0,0, \
 /* oom */	0, \
diff -urN 2.2.17pre9/init/main.c 2.2.17pre9-VM/init/main.c
--- 2.2.17pre9/init/main.c	Tue Jun 13 03:48:15 2000
+++ 2.2.17pre9-VM/init/main.c	Mon Jul 3 15:08:47 2000
@@ -77,7 +77,6 @@
 extern int bdflush(void *);
 extern int kupdate(void *);
 extern int kswapd(void *);
-extern int kpiod(void *);
 extern void kswapd_setup(void);
 extern unsigned long init_IRQ( unsigned long);
 extern void init_modules(void);
@@ -1531,7 +1530,6 @@
 	kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 	/* Start the background pageout daemon. */
 	kswapd_setup();
-	kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
diff -urN 2.2.17pre9/ipc/shm.c 2.2.17pre9-VM/ipc/shm.c
--- 2.2.17pre9/ipc/shm.c	Tue Jun 13 03:48:15 2000
+++ 2.2.17pre9-VM/ipc/shm.c	Sun Jul 2 23:57:23 2000
@@ -679,7 +679,7 @@
 }
 
 /*
- * Goes through counter = (shm_rss >> prio) present shm pages.
+ * Goes through counter = (shm_rss / prio) present shm pages.
 */
 static unsigned long swap_id = 0; /* currently being swapped */
 static unsigned long swap_idx = 0; /* next to swap */
@@ -693,7 +693,7 @@
 	int loop = 0;
 	int counter;
 
-	counter = shm_rss >> prio;
+	counter = shm_rss / prio;
 	if (!counter || !(swap_nr = get_swap_page()))
 		return 0;
diff -urN 2.2.17pre9/kernel/fork.c 2.2.17pre9-VM/kernel/fork.c
--- 2.2.17pre9/kernel/fork.c	Mon Jan 17 16:44:50 2000
+++ 2.2.17pre9-VM/kernel/fork.c	Sun Jul 2 23:45:13 2000
@@ -665,6 +665,8 @@
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
+	INIT_LIST_HEAD(&p->local_pages);
+
 	retval = -ENOMEM;
 	/* copy all the process information */
 	if (copy_files(clone_flags, p))
diff -urN 2.2.17pre9/mm/filemap.c 2.2.17pre9-VM/mm/filemap.c
--- 2.2.17pre9/mm/filemap.c	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/mm/filemap.c	Mon Jul 3 15:08:19 2000
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -36,26 +35,6 @@
 unsigned int page_hash_bits, page_hash_mask;
 struct page **page_hash_table;
 
-/*
- * Define a request structure for outstanding page write requests
- * to the background page io daemon
- */
-
-struct pio_request
-{
-	struct pio_request *	next;
-	struct file *		file;
-	unsigned long		offset;
-	unsigned long		page;
-};
-static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
-static kmem_cache_t *pio_request_cache;
-static struct wait_queue *pio_wait = NULL;
-
-static inline void
-make_pio_request(struct file *, unsigned long, unsigned long);
-
-
 /*
  * Invalidate the pages of an inode, removing all pages that aren't
  * locked down (those are sure to be up-to-date anyway, so we shouldn't
@@ -141,10 +120,9 @@
 	unsigned long limit = num_physpages;
 	struct page * page;
 	int count;
-	int nr_dirty = 0;
-
+	
 	/* Make sure we scan all pages twice at priority 0. */
-	count = (limit << 1) >> priority;
+	count = limit / priority;
 
 refresh_clock:
 	page = mem_map + clock;
@@ -198,14 +176,6 @@
 
 		/* Is it a buffer page? */
 		if (page->buffers) {
-			/*
-			 * Wait for async IO to complete
-			 * at each 64 buffers
-			 */
-
-			int wait = ((gfp_mask & __GFP_IO)
-				    && (!(nr_dirty++ % 64)));
-
 			if (buffer_under_min())
 				continue;
 			/*
@@ -213,7 +183,7 @@
 			 * throttling.
 			 */
 
-			if (!try_to_free_buffers(page, wait))
+			if (!try_to_free_buffers(page))
 				goto refresh_clock;
 
 			return 1;
 		}
@@ -1146,8 +1116,7 @@
 
 static int filemap_write_page(struct vm_area_struct * vma,
 			      unsigned long offset,
-			      unsigned long page,
-			      int wait)
+			      unsigned long page)
 {
 	int result;
 	struct file * file;
@@ -1165,17 +1134,6 @@
 	 * and file could be released ... increment the count to be safe.
 	 */
 	file->f_count++;
-
-	/*
-	 * If this is a swapping operation rather than msync(), then
-	 * leave the actual IO, and the restoration of the file count,
-	 * to the kpiod thread.  Just queue the request for now.
-	 */
-	if (!wait) {
-		make_pio_request(file, offset, page);
-		return 0;
-	}
-
 	down(&inode->i_sem);
 	result = do_write_page(inode, file, (const char *) page, offset);
 	up(&inode->i_sem);
@@ -1191,7 +1149,7 @@
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-	return filemap_write_page(vma, page->offset, page_address(page), 0);
+	return filemap_write_page(vma, page->offset, page_address(page));
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1228,7 +1186,7 @@
 			return 0;
 		}
 	}
-	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
+	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 	page_cache_free(page);
 	return error;
 }
@@ -1658,130 +1616,6 @@
 	clear_bit(PG_locked, &page->flags);
 	wake_up(&page->wait);
 	page_cache_release(page);
-}
-
-
-/* Add request for page IO to the queue */
-
-static inline void put_pio_request(struct pio_request *p)
-{
-	*pio_last = p;
-	p->next = NULL;
-	pio_last = &p->next;
-}
-
-/* Take the first page IO request off the queue */
-
-static inline struct pio_request * get_pio_request(void)
-{
-	struct pio_request * p = pio_first;
-	pio_first = p->next;
-	if (!pio_first)
-		pio_last = &pio_first;
-	return p;
-}
-
-/* Make a new page IO request and queue it to the kpiod thread */
-
-static inline void make_pio_request(struct file *file,
-				    unsigned long offset,
-				    unsigned long page)
-{
-	struct pio_request *p;
-
-	atomic_inc(&page_cache_entry(page)->count);
-
-	/*
-	 * We need to allocate without causing any recursive IO in the
-	 * current thread's context.  We might currently be swapping out
-	 * as a result of an allocation made while holding a critical
-	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
-	 * the filesystem in this thread.
-	 *
-	 * We can wait for kswapd to free memory, or we can try to free
-	 * pages without actually performing further IO, without fear of
-	 * deadlock.  --sct
-	 */
-
-	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
-		if (try_to_free_pages(__GFP_WAIT))
-			continue;
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(HZ/10);
-	}
-
-	p->file   = file;
-	p->offset = offset;
-	p->page   = page;
-
-	put_pio_request(p);
-	wake_up(&pio_wait);
-}
-
-
-/*
- * This is the only thread which is allowed to write out filemap pages
- * while swapping.
- *
- * To avoid deadlock, it is important that we never reenter this thread.
- * Although recursive memory allocations within this thread may result
- * in more page swapping, that swapping will always be done by queuing
- * another IO request to the same thread: we will never actually start
- * that IO request until we have finished with the current one, and so
- * we will not deadlock.
- */
-
-int kpiod(void * unused)
-{
-	struct task_struct *tsk = current;
-	struct wait_queue wait = { tsk, };
-	struct inode * inode;
-	struct dentry * dentry;
-	struct pio_request * p;
-
-	tsk->session = 1;
-	tsk->pgrp = 1;
-	strcpy(tsk->comm, "kpiod");
-	sigfillset(&tsk->blocked);
-	init_waitqueue(&pio_wait);
-	/*
-	 * Mark this task as a memory allocator - we don't want to get caught
-	 * up in the regular mm freeing frenzy if we have to allocate memory
-	 * in order to write stuff out.
-	 */
-	tsk->flags |= PF_MEMALLOC;
-
-	lock_kernel();
-
-	pio_request_cache = kmem_cache_create("pio_request",
-					      sizeof(struct pio_request),
-					      0, SLAB_HWCACHE_ALIGN,
-					      NULL, NULL);
-	if (!pio_request_cache)
-		panic ("Could not create pio_request slab cache");
-
-	while (1) {
-		tsk->state = TASK_INTERRUPTIBLE;
-		add_wait_queue(&pio_wait, &wait);
-		if (!pio_first)
-			schedule();
-		remove_wait_queue(&pio_wait, &wait);
-		tsk->state = TASK_RUNNING;
-
-		while (pio_first) {
-			p = get_pio_request();
-			dentry = p->file->f_dentry;
-			inode = dentry->d_inode;
-
-			down(&inode->i_sem);
-			do_write_page(inode, p->file,
-				      (const char *) p->page, p->offset);
-			up(&inode->i_sem);
-			fput(p->file);
-			page_cache_free(p->page);
-			kmem_cache_free(pio_request_cache, p);
-		}
-	}
 }
 
 void __init page_cache_init(unsigned long memory_size)
diff -urN 2.2.17pre9/mm/page_alloc.c 2.2.17pre9-VM/mm/page_alloc.c
--- 2.2.17pre9/mm/page_alloc.c	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/mm/page_alloc.c	Sun Jul 2 23:56:30 2000
@@ -93,34 +93,68 @@
  */
 spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
 
-static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
-{
+#define list(x) (mem_map+(x))
+#define __free_pages_ok(map_nr, mask, area, index)		\
+	nr_free_pages -= (mask);				\
+	while ((mask) + (1 << (NR_MEM_LISTS-1))) {		\
+		if (!test_and_change_bit((index), (area)->map))	\
+			break;					\
+		(area)->count--;				\
+		remove_mem_queue(list((map_nr) ^ -(mask)));	\
+		(mask) <<= 1;					\
+		(area)++;					\
+		(index) >>= 1;					\
+		(map_nr) &= (mask);				\
+	}							\
+	add_mem_queue(area, list(map_nr));
+
+static void free_local_pages(struct page * page) {
+	unsigned long order = page->offset;
+	unsigned int type = PageDMA(page) ? 1 : 0;
 	struct free_area_struct *area = free_area[type] + order;
-	unsigned long index = map_nr >> (1 + order);
+	unsigned long map_nr = page - mem_map;
 	unsigned long mask = (~0UL) << order;
-	unsigned long flags;
+	unsigned long index = map_nr >> (1 + order);
 
-	spin_lock_irqsave(&page_alloc_lock, flags);
+	__free_pages_ok(map_nr, mask, area, index);
+}
 
-#define list(x) (mem_map+(x))
+static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
+{
+	struct free_area_struct *area;
+	unsigned long index;
+	unsigned long mask;
+	unsigned long flags;
+	struct page * page;
 
+	if (current->flags & PF_FREE_PAGES)
+		goto local_freelist;
+ back_local_freelist:
+
+	area = free_area[type] + order;
+	index = map_nr >> (1 + order);
+	mask = (~0UL) << order;
 	map_nr &= mask;
-	nr_free_pages -= mask;
-	while (mask + (1 << (NR_MEM_LISTS-1))) {
-		if (!test_and_change_bit(index, area->map))
-			break;
-		area->count--;
-		remove_mem_queue(list(map_nr ^ -mask));
-		mask <<= 1;
-		area++;
-		index >>= 1;
-		map_nr &= mask;
-	}
-	add_mem_queue(area, list(map_nr));
-
-#undef list
+	spin_lock_irqsave(&page_alloc_lock, flags);
+	__free_pages_ok(map_nr, mask, area, index);
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
+	return;
+
+ local_freelist:
+	/*
+	 * This is a little subtle: if the allocation order
+	 * wanted is greater than zero we'd better take all the pages
+	 * local since we must deal with fragmentation too and we
+	 * can't rely on the nr_local_pages information.
+	 */
+	if (current->nr_local_pages && !current->allocation_order)
+		goto back_local_freelist;
+
+	page = mem_map + map_nr;
+	list_add((struct list_head *) page, &current->local_pages);
+	page->offset = order;
+	current->nr_local_pages++;
 }
 
 void __free_pages(struct page *page, unsigned long order)
@@ -179,13 +213,32 @@
 		atomic_set(&map->count, 1); \
 	} while (0)
 
+static void refile_local_pages(void)
+{
+	if (current->nr_local_pages) {
+		struct page * page;
+		struct list_head * entry;
+		int nr_pages = current->nr_local_pages;
+
+		while ((entry = current->local_pages.next) != &current->local_pages) {
+			list_del(entry);
+			page = (struct page *) entry;
+			free_local_pages(page);
+			if (!nr_pages--)
+				panic("__get_free_pages local_pages list corrupted I");
+		}
+		if (nr_pages)
+			panic("__get_free_pages local_pages list corrupted II");
+		current->nr_local_pages = 0;
+	}
+}
+
 unsigned long __get_free_pages(int gfp_mask, unsigned long order)
 {
 	unsigned long flags;
-	static atomic_t free_before_allocate = ATOMIC_INIT(0);
 
 	if (order >= NR_MEM_LISTS)
-		goto nopage;
+		goto out;
 
 #ifdef ATOMIC_MEMORY_DEBUGGING
 	if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
@@ -194,26 +247,24 @@
 			printk("gfp called nonatomically from interrupt %p\n",
 			       __builtin_return_address(0));
 		}
-		goto nopage;
+		goto out;
 	}
 #endif
 
 	/*
+	 * Acquire lock before reading nr_free_pages to make sure it
+	 * won't change from under us.
+	 */
+	spin_lock_irqsave(&page_alloc_lock, flags);
+
+	/*
 	 * If this is a recursive call, we'd better
 	 * do our best to just allocate things without
 	 * further thought.
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
-		int freed;
 		extern struct wait_queue * kswapd_wait;
 
-		/* Somebody needs to free pages so we free some of our own. */
-		if (atomic_read(&free_before_allocate)) {
-			current->flags |= PF_MEMALLOC;
-			try_to_free_pages(gfp_mask);
-			current->flags &= ~PF_MEMALLOC;
-		}
-
 		if (nr_free_pages > freepages.low)
 			goto ok_to_allocate;
 
@@ -224,34 +275,44 @@
 		if (nr_free_pages > freepages.min)
 			goto ok_to_allocate;
 
-		current->flags |= PF_MEMALLOC;
-		atomic_inc(&free_before_allocate);
-		freed = try_to_free_pages(gfp_mask);
-		atomic_dec(&free_before_allocate);
-		current->flags &= ~PF_MEMALLOC;
-
-		/*
-		 * Re-check we're still low on memory after we blocked
-		 * for some time. Somebody may have released lots of
-		 * memory from under us while we was trying to free
-		 * the pages. We check against pages_high to be sure
-		 * to succeed only if lots of memory is been released.
-		 */
-		if (nr_free_pages > freepages.high)
-			goto ok_to_allocate;
+		if (gfp_mask & __GFP_WAIT) {
+			int freed;
+			/*
+			 * If the task is ok to sleep it's fine also
+			 * if we release irq here.
+			 */
+			spin_unlock_irq(&page_alloc_lock);
+
+			current->flags |= PF_MEMALLOC|PF_FREE_PAGES;
+			current->allocation_order = order;
+			freed = try_to_free_pages(gfp_mask);
+			current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES);
+
+			spin_lock_irq(&page_alloc_lock);
+			refile_local_pages();
+
+			/*
+			 * Re-check we're still low on memory after we blocked
+			 * for some time. Somebody may have released lots of
+			 * memory from under us while we was trying to free
+			 * the pages. We check against pages_high to be sure
+			 * to succeed only if lots of memory is been released.
+			 */
+			if (nr_free_pages > freepages.high)
+				goto ok_to_allocate;
 
-		if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
-			goto nopage;
+			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+				goto nopage;
+		}
 	}
 ok_to_allocate:
-	spin_lock_irqsave(&page_alloc_lock, flags);
 	/* if it's not a dma request, try non-dma first */
 	if (!(gfp_mask & __GFP_DMA))
 		RMQUEUE_TYPE(order, 0);
 	RMQUEUE_TYPE(order, 1);
+ nopage:
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
-
-nopage:
+ out:
 	return 0;
 }
 
@@ -310,8 +371,8 @@
 	 * analysis.
 	 */
 	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
-	if (i < 10)
-		i = 10;
+	if (i < 50)
+		i = 50;
 	if (i > 256)
 		i = 256;
 	freepages.min = i;
diff -urN 2.2.17pre9/mm/vmscan.c 2.2.17pre9-VM/mm/vmscan.c
--- 2.2.17pre9/mm/vmscan.c	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/mm/vmscan.c	Sun Jul 2 23:57:54 2000
@@ -327,7 +327,7 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
+	counter = nr_tasks / priority;
 	if (counter < 1)
 		counter = 1;
 
@@ -377,11 +377,9 @@
  * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
-static int do_try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int ret = 0;
-	int swapcount;
 	int count = SWAP_CLUSTER_MAX;
 
 	lock_kernel();
@@ -392,7 +390,6 @@
 	priority = 6;
 	do {
 		while (shrink_mmap(priority, gfp_mask)) {
-			ret = 1;
 			if (!--count)
 				goto done;
 		}
@@ -400,30 +397,24 @@
 		/* Try to get rid of some shared memory pages.. */
 		if (gfp_mask & __GFP_IO) {
 			while (shm_swap(priority, gfp_mask)) {
-				ret = 1;
 				if (!--count)
 					goto done;
 			}
 		}
 
 		/* Then, try to page stuff out.. */
-		swapcount = count;
 		while (swap_out(priority, gfp_mask)) {
-			ret = 1;
-			if (!--swapcount)
-				break;
+			if (!--count)
+				goto done;
 		}
 
 		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+	} while (--priority > 0);
 done:
 	unlock_kernel();
 
-	if (!ret)
-		printk("VM: do_try_to_free_pages failed for %s...\n",
-		       current->comm);
 
 	/* Return success if we freed a page. */
-	return ret;
+	return priority > 0;
 }
 
 /*
@@ -499,7 +490,7 @@
 
 		while (nr_free_pages < freepages.high)
 		{
-			if (do_try_to_free_pages(GFP_KSWAPD))
+			if (try_to_free_pages(GFP_KSWAPD))
 			{
 				if (tsk->need_resched)
 					schedule();
@@ -510,17 +501,3 @@
 		}
 	}
 }
-
-/*
- * Called by non-kswapd processes when kswapd really cannot
- * keep up with the demand for free memory.
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
-	int retval = 1;
-
-	if (gfp_mask & __GFP_WAIT)
-		retval = do_try_to_free_pages(gfp_mask);
-	return retval;
-}
-
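
For reference, below is a minimal standalone userspace sketch (not part of the patch, and not kernel code) of the balance_dirty_state() arithmetic that the fs/buffer.c changes above introduce: dirty buffer pages are compared against a soft limit of tot * nfract / 200 pages and a hard limit of twice that, returning -1 (no flush needed), 0 (async flush) or 1 (sync flush). The nfract and memory-size values used here are assumed examples for illustration only, not values taken from the patch or from the bdf_prm defaults.

#include <stdio.h>

/* Same decision logic as balance_dirty_state() in the patch, with the
 * inputs passed in explicitly instead of read from kernel globals. */
static int balance_dirty_state(unsigned long dirty_pages,
                               unsigned long total_pages,
                               unsigned long nfract)
{
        unsigned long dirty = dirty_pages * 200;
        unsigned long soft_dirty_limit = total_pages * nfract;
        unsigned long hard_dirty_limit = soft_dirty_limit * 2;

        if (dirty > soft_dirty_limit) {
                if (dirty > hard_dirty_limit)
                        return 1;       /* sync flush: wake bdflush and wait */
                return 0;               /* async flush: wake bdflush only */
        }
        return -1;                      /* below the soft limit: do nothing */
}

int main(void)
{
        unsigned long tot = 32768;      /* assumed: 128MB worth of 4KB pages */
        unsigned long nfract = 40;      /* assumed example bdflush nfract */
        unsigned long dirty;

        /* Soft limit works out to tot*nfract/200 pages, hard limit twice that. */
        for (dirty = 0; dirty <= tot; dirty += tot / 8)
                printf("dirty=%5lu/%lu -> %d\n", dirty, tot,
                       balance_dirty_state(dirty, tot, nfract));
        return 0;
}

With the assumed nfract of 40, the soft limit sits at 20% of the considered pages and the hard limit at 40%, which is where balance_dirty() switches from an asynchronous bdflush wakeup to a synchronous one.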