diff -urN 2.2.17pre9/drivers/block/rd.c 2.2.17pre9-VM/drivers/block/rd.c
--- 2.2.17pre9/drivers/block/rd.c	Wed Jun 28 17:13:02 2000
+++ 2.2.17pre9-VM/drivers/block/rd.c	Sun Jul 2 23:56:02 2000
@@ -173,7 +173,7 @@
 		if (CURRENT->cmd == READ)
 			memset(CURRENT->buffer, 0, len);
 		else
-			set_bit(BH_Protected, &CURRENT->bh->b_state);
+			mark_buffer_protected(CURRENT->bh);
 
 		end_request(1);
 		goto repeat;
diff -urN 2.2.17pre9/fs/buffer.c 2.2.17pre9-VM/fs/buffer.c
--- 2.2.17pre9/fs/buffer.c	Wed Jun 28 17:13:08 2000
+++ 2.2.17pre9-VM/fs/buffer.c	Mon Jul 3 11:29:32 2000
@@ -27,6 +27,8 @@
 /* invalidate_buffers/set_blocksize/sync_dev race conditions and
    fs corruption fixes, 1999, Andrea Arcangeli */
 
+/* async buffer flushing, 1999 Andrea Arcangeli */
+
 /* Wait for dirty buffers to sync in sync_page_buffers.
  * 2000, Marcelo Tosatti */
 
@@ -83,6 +85,7 @@
 
 static int nr_buffers = 0;
 static int nr_buffers_type[NR_LIST] = {0,};
+static unsigned long size_buffers_type[NR_LIST];
 static int nr_buffer_heads = 0;
 static int nr_unused_buffer_heads = 0;
 static int nr_hashed_buffers = 0;
@@ -474,6 +477,7 @@
 		return;
 	}
 	nr_buffers_type[bh->b_list]--;
+	size_buffers_type[bh->b_list] -= bh->b_size;
 	remove_from_hash_queue(bh);
 	remove_from_lru_list(bh);
 }
@@ -523,6 +527,7 @@
 	(*bhp)->b_prev_free = bh;
 
 	nr_buffers_type[bh->b_list]++;
+	size_buffers_type[bh->b_list] += bh->b_size;
 
 	/* Put the buffer in new hash-queue if it has a device. */
 	bh->b_next = NULL;
@@ -816,6 +821,46 @@
 	insert_into_queues(bh);
 }
 
+/* -1 -> no need to flush
+    0 -> async flush
+    1 -> sync flush (wait for I/O completion) */
+static int balance_dirty_state(kdev_t dev)
+{
+	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
+
+	dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
+	tot = (buffermem >> PAGE_SHIFT) + nr_free_pages;
+	tot -= size_buffers_type[BUF_PROTECTED] >> PAGE_SHIFT;
+
+	dirty *= 200;
+	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
+	hard_dirty_limit = soft_dirty_limit * 2;
+
+	if (dirty > soft_dirty_limit)
+	{
+		if (dirty > hard_dirty_limit)
+			return 1;
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * if a new dirty buffer is created we need to balance bdflush.
+ *
+ * in the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+	int state = balance_dirty_state(dev);
+
+	if (state < 0)
+		return;
+	wakeup_bdflush(state);
+}
+
 /*
  * A buffer may need to be moved from one buffer list to another
  * (e.g. in case it is not shared any more). Handle this.
@@ -828,7 +873,9 @@
 		printk("Attempt to refile free buffer\n");
 		return;
 	}
-	if (buffer_dirty(buf))
+	if (buffer_protected(buf))
+		dispose = BUF_PROTECTED;
+	else if (buffer_dirty(buf))
 		dispose = BUF_DIRTY;
 	else if (buffer_locked(buf))
 		dispose = BUF_LOCKED;
@@ -837,13 +884,7 @@
 	if(dispose != buf->b_list) {
 		file_buffer(buf, dispose);
 		if(dispose == BUF_DIRTY) {
-			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-			/* This buffer is dirty, maybe we need to start flushing.
-			 * If too high a percentage of the buffers are dirty...
-			 */
-			if (nr_buffers_type[BUF_DIRTY] > too_many)
-				wakeup_bdflush(1);
+			balance_dirty(buf->b_dev);
 
 			/* If this is a loop device, and
 			 * more than half of the buffers are dirty...
@@ -1468,18 +1509,23 @@
 #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
 #define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
 
-static int sync_page_buffers(struct buffer_head *bh, int wait)
+static int sync_page_buffers(struct buffer_head *bh)
 {
 	struct buffer_head * tmp = bh;
 
 	do {
 		struct buffer_head *p = tmp;
 		tmp = tmp->b_this_page;
-		if (buffer_locked(p)) {
-			if (wait)
-				__wait_on_buffer(p);
-		} else if (buffer_dirty(p))
-			ll_rw_block(WRITE, 1, &p);
+		if (buffer_dirty(p) || buffer_locked(p)) {
+			if (test_and_set_bit(BH_Wait_IO, &p->b_state)) {
+				if (buffer_dirty(p))
+					ll_rw_block(WRITE, 1, &p);
+				wait_on_buffer(p);
+			} else {
+				if (buffer_dirty(p))
+					ll_rw_block(WRITEA, 1, &p);
+			}
+		}
 	} while (tmp != bh);
 
 	do {
@@ -1499,10 +1545,9 @@
  * Wake up bdflush() if this fails - if we're running low on memory due
  * to dirty buffers, we need to flush them out as quickly as possible.
  */
-int try_to_free_buffers(struct page * page_map, int wait)
+int try_to_free_buffers(struct page * page_map)
 {
 	struct buffer_head * tmp, * bh = page_map->buffers;
-	int too_many;
 
 	tmp = bh;
 	do {
@@ -1531,25 +1576,14 @@
 	return 1;
 
 busy:
-	too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-	if (!sync_page_buffers(bh, wait)) {
-
-		/* If a high percentage of the buffers are dirty,
-		 * wake kflushd
-		 */
-		if (nr_buffers_type[BUF_DIRTY] > too_many)
-			wakeup_bdflush(0);
-
+	if (!sync_page_buffers(bh))
 		/*
 		 * We can jump after the busy check because
 		 * we rely on the kernel lock.
 		 */
 		goto succeed;
-	}
 
-	if(nr_buffers_type[BUF_DIRTY] > too_many)
-		wakeup_bdflush(0);
+	wakeup_bdflush(0);
 
 	return 0;
 }
@@ -1561,7 +1595,7 @@
 	int found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
 	int protected = 0;
 	int nlist;
-	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY"};
+	static char *buf_types[NR_LIST] = {"CLEAN","LOCKED","DIRTY","PROTECTED",};
 
 	printk("Buffer memory: %8ldkB\n",buffermem>>10);
 	printk("Buffer heads: %6d\n",nr_buffer_heads);
@@ -1585,7 +1619,7 @@
 			used++, lastused = found;
 		bh = bh->b_next_free;
 	} while (bh != lru_list[nlist]);
-	printk("%8s: %d buffers, %d used (last=%d), "
+	printk("%9s: %d buffers, %d used (last=%d), "
 	       "%d locked, %d protected, %d dirty\n",
 	       buf_types[nlist], found, used, lastused,
 	       locked, protected, dirty);
@@ -1930,7 +1964,8 @@
 
 		/* If there are still a lot of dirty buffers around,
 		 * skip the sleep and flush some more */
-		if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
+		if (!ndirty || balance_dirty_state(NODEV) < 0)
+		{
 			spin_lock_irq(&current->sigmask_lock);
 			flush_signals(current);
 			spin_unlock_irq(&current->sigmask_lock);
diff -urN 2.2.17pre9/fs/dcache.c 2.2.17pre9-VM/fs/dcache.c
--- 2.2.17pre9/fs/dcache.c	Tue Jun 13 03:48:14 2000
+++ 2.2.17pre9-VM/fs/dcache.c	Sun Jul 2 23:57:23 2000
@@ -477,7 +477,7 @@
 {
 	if (gfp_mask & __GFP_IO) {
 		int count = 0;
-		if (priority)
+		if (priority > 1)
 			count = dentry_stat.nr_unused / priority;
 		prune_dcache(count, -1);
 	}
diff -urN 2.2.17pre9/fs/ext2/super.c 2.2.17pre9-VM/fs/ext2/super.c
--- 2.2.17pre9/fs/ext2/super.c	Mon Jan 17 16:44:42 2000
+++ 2.2.17pre9-VM/fs/ext2/super.c	Mon Jul 3 15:10:43 2000
@@ -589,7 +589,7 @@
 		EXT2_BLOCKS_PER_GROUP(sb);
 	db_count = (sb->u.ext2_sb.s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
 		   EXT2_DESC_PER_BLOCK(sb);
-	sb->u.ext2_sb.s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
+	sb->u.ext2_sb.s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_BUFFER);
 	if (sb->u.ext2_sb.s_group_desc == NULL) {
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
diff -urN 2.2.17pre9/include/linux/fs.h 2.2.17pre9-VM/include/linux/fs.h
--- 2.2.17pre9/include/linux/fs.h	Fri Jun 30 04:03:09 2000
+++ 2.2.17pre9-VM/include/linux/fs.h	Mon Jul 3 04:45:55 2000
@@ -185,6 +185,7 @@
 #define BH_Lock		2	/* 1 if the buffer is locked */
 #define BH_Req		3	/* 0 if the buffer has been invalidated */
 #define BH_Protected	6	/* 1 if the buffer is protected */
+#define BH_Wait_IO	7	/* 1 if we should throttle on this buffer */
 
 /*
  * Try to keep the most commonly used fields in single cache lines (16
@@ -754,7 +755,7 @@
 extern void refile_buffer(struct buffer_head * buf);
 extern void set_writetime(struct buffer_head * buf, int flag);
 
-extern int try_to_free_buffers(struct page *, int wait);
+extern int try_to_free_buffers(struct page *);
 
 extern int nr_buffers;
 extern long buffermem;
@@ -763,9 +764,18 @@
 #define BUF_CLEAN	0
 #define BUF_LOCKED	1	/* Buffers scheduled for write */
 #define BUF_DIRTY	2	/* Dirty buffers, not yet scheduled for write */
-#define NR_LIST		3
+#define BUF_PROTECTED	3	/* Ramdisk persistent storage */
+#define NR_LIST		4
 
 void mark_buffer_uptodate(struct buffer_head * bh, int on);
+
+extern inline void mark_buffer_protected(struct buffer_head * bh)
+{
+	if (!test_and_set_bit(BH_Protected, &bh->b_state)) {
+		if (bh->b_list != BUF_PROTECTED)
+			refile_buffer(bh);
+	}
+}
 
 extern inline void mark_buffer_clean(struct buffer_head * bh)
 {
diff -urN 2.2.17pre9/include/linux/locks.h 2.2.17pre9-VM/include/linux/locks.h
--- 2.2.17pre9/include/linux/locks.h	Sun Jul 2 12:52:47 2000
+++ 2.2.17pre9-VM/include/linux/locks.h	Mon Jul 3 04:15:36 2000
@@ -29,6 +29,7 @@
 extern inline void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit(BH_Lock, &bh->b_state);
+	clear_bit(BH_Wait_IO, &bh->b_state);
 	wake_up(&bh->b_wait);
 }
 
diff -urN 2.2.17pre9/include/linux/sched.h 2.2.17pre9-VM/include/linux/sched.h
--- 2.2.17pre9/include/linux/sched.h	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/include/linux/sched.h	Sun Jul 2 23:45:13 2000
@@ -316,6 +316,7 @@
 	struct files_struct *files;
 /* memory management info */
 	struct mm_struct *mm;
+	struct list_head local_pages; int allocation_order, nr_local_pages;
 /* signal handlers */
 	spinlock_t sigmask_lock;	/* Protects signal and blocked */
@@ -348,6 +349,7 @@
 #define PF_SIGNALED	0x00000400	/* killed by a signal */
 #define PF_MEMALLOC	0x00000800	/* Allocating memory */
 #define PF_VFORK	0x00001000	/* Wake up parent in mm_release */
+#define PF_FREE_PAGES	0x00002000	/* The current-> */
 
 #define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
 #define PF_DTRACE	0x00200000	/* delayed trace (used on m68k, i386) */
@@ -395,7 +397,7 @@
 /* tss */	INIT_TSS, \
 /* fs */	&init_fs, \
 /* files */	&init_files, \
-/* mm */	&init_mm, \
+/* mm */	&init_mm, { &init_task.local_pages, &init_task.local_pages}, 0, 0, \
 /* signals */	SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \
 /* exec cts */	0,0, \
 /* oom */	0, \
diff -urN 2.2.17pre9/init/main.c 2.2.17pre9-VM/init/main.c
--- 2.2.17pre9/init/main.c	Tue Jun 13 03:48:15 2000
+++ 2.2.17pre9-VM/init/main.c	Mon Jul 3 15:08:47 2000
@@ -77,7 +77,6 @@
 extern int bdflush(void *);
 extern int kupdate(void *);
 extern int kswapd(void *);
-extern int kpiod(void *);
 extern void kswapd_setup(void);
 extern unsigned long init_IRQ( unsigned long);
 extern void init_modules(void);
@@ -1531,7 +1530,6 @@
 	kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 	/* Start the background pageout daemon. */
 	kswapd_setup();
-	kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
diff -urN 2.2.17pre9/ipc/shm.c 2.2.17pre9-VM/ipc/shm.c
--- 2.2.17pre9/ipc/shm.c	Tue Jun 13 03:48:15 2000
+++ 2.2.17pre9-VM/ipc/shm.c	Sun Jul 2 23:57:23 2000
@@ -679,7 +679,7 @@
 }
 
 /*
- * Goes through counter = (shm_rss >> prio) present shm pages.
+ * Goes through counter = (shm_rss / prio) present shm pages.
 */
 static unsigned long swap_id = 0; /* currently being swapped */
 static unsigned long swap_idx = 0; /* next to swap */
@@ -693,7 +693,7 @@
 	int loop = 0;
 	int counter;
 
-	counter = shm_rss >> prio;
+	counter = shm_rss / prio;
 	if (!counter || !(swap_nr = get_swap_page()))
 		return 0;
diff -urN 2.2.17pre9/kernel/fork.c 2.2.17pre9-VM/kernel/fork.c
--- 2.2.17pre9/kernel/fork.c	Mon Jan 17 16:44:50 2000
+++ 2.2.17pre9-VM/kernel/fork.c	Sun Jul 2 23:45:13 2000
@@ -665,6 +665,8 @@
 	p->lock_depth = -1;		/* -1 = no lock */
 	p->start_time = jiffies;
 
+	INIT_LIST_HEAD(&p->local_pages);
+
 	retval = -ENOMEM;
 	/* copy all the process information */
 	if (copy_files(clone_flags, p))
diff -urN 2.2.17pre9/mm/filemap.c 2.2.17pre9-VM/mm/filemap.c
--- 2.2.17pre9/mm/filemap.c	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/mm/filemap.c	Mon Jul 3 15:08:19 2000
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -36,26 +35,6 @@
 unsigned int page_hash_bits, page_hash_mask;
 struct page **page_hash_table;
 
-/*
- * Define a request structure for outstanding page write requests
- * to the background page io daemon
- */
-
-struct pio_request
-{
-	struct pio_request *	next;
-	struct file *		file;
-	unsigned long		offset;
-	unsigned long		page;
-};
-static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
-static kmem_cache_t *pio_request_cache;
-static struct wait_queue *pio_wait = NULL;
-
-static inline void
-make_pio_request(struct file *, unsigned long, unsigned long);
-
-
 /*
  * Invalidate the pages of an inode, removing all pages that aren't
  * locked down (those are sure to be up-to-date anyway, so we shouldn't
@@ -141,10 +120,9 @@
 	unsigned long limit = num_physpages;
 	struct page * page;
 	int count;
-	int nr_dirty = 0;
-
+	
 	/* Make sure we scan all pages twice at priority 0. */
-	count = (limit << 1) >> priority;
+	count = limit / priority;
 
 refresh_clock:
 	page = mem_map + clock;
@@ -198,14 +176,6 @@
 
 		/* Is it a buffer page? */
 		if (page->buffers) {
-			/*
-			 * Wait for async IO to complete
-			 * at each 64 buffers
-			 */
-
-			int wait = ((gfp_mask & __GFP_IO)
-				    && (!(nr_dirty++ % 64)));
-
 			if (buffer_under_min())
 				continue;
 			/*
@@ -213,7 +183,7 @@
 			 * throttling.
 			 */
 
-			if (!try_to_free_buffers(page, wait))
+			if (!try_to_free_buffers(page))
 				goto refresh_clock;
 
 			return 1;
 		}
@@ -1146,8 +1116,7 @@
 
 static int filemap_write_page(struct vm_area_struct * vma,
 			      unsigned long offset,
-			      unsigned long page,
-			      int wait)
+			      unsigned long page)
 {
 	int result;
 	struct file * file;
@@ -1165,17 +1134,6 @@
 	 * and file could be released ... increment the count to be safe.
 	 */
 	file->f_count++;
-
-	/*
-	 * If this is a swapping operation rather than msync(), then
-	 * leave the actual IO, and the restoration of the file count,
-	 * to the kpiod thread.  Just queue the request for now.
-	 */
-	if (!wait) {
-		make_pio_request(file, offset, page);
-		return 0;
-	}
-
 	down(&inode->i_sem);
 	result = do_write_page(inode, file, (const char *) page, offset);
 	up(&inode->i_sem);
@@ -1191,7 +1149,7 @@
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-	return filemap_write_page(vma, page->offset, page_address(page), 0);
+	return filemap_write_page(vma, page->offset, page_address(page));
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1228,7 +1186,7 @@
 			return 0;
 		}
 	}
-	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
+	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
 	page_cache_free(page);
 	return error;
 }
@@ -1658,130 +1616,6 @@
 	clear_bit(PG_locked, &page->flags);
 	wake_up(&page->wait);
 	page_cache_release(page);
-}
-
-
-/* Add request for page IO to the queue */
-
-static inline void put_pio_request(struct pio_request *p)
-{
-	*pio_last = p;
-	p->next = NULL;
-	pio_last = &p->next;
-}
-
-/* Take the first page IO request off the queue */
-
-static inline struct pio_request * get_pio_request(void)
-{
-	struct pio_request * p = pio_first;
-	pio_first = p->next;
-	if (!pio_first)
-		pio_last = &pio_first;
-	return p;
-}
-
-/* Make a new page IO request and queue it to the kpiod thread */
-
-static inline void make_pio_request(struct file *file,
-				    unsigned long offset,
-				    unsigned long page)
-{
-	struct pio_request *p;
-
-	atomic_inc(&page_cache_entry(page)->count);
-
-	/*
-	 * We need to allocate without causing any recursive IO in the
-	 * current thread's context.  We might currently be swapping out
-	 * as a result of an allocation made while holding a critical
-	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
-	 * the filesystem in this thread.
-	 *
-	 * We can wait for kswapd to free memory, or we can try to free
-	 * pages without actually performing further IO, without fear of
-	 * deadlock.  --sct
-	 */
-
-	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
-		if (try_to_free_pages(__GFP_WAIT))
-			continue;
-		current->state = TASK_INTERRUPTIBLE;
-		schedule_timeout(HZ/10);
-	}
-
-	p->file   = file;
-	p->offset = offset;
-	p->page   = page;
-
-	put_pio_request(p);
-	wake_up(&pio_wait);
-}
-
-
-/*
- * This is the only thread which is allowed to write out filemap pages
- * while swapping.
- *
- * To avoid deadlock, it is important that we never reenter this thread.
- * Although recursive memory allocations within this thread may result
- * in more page swapping, that swapping will always be done by queuing
- * another IO request to the same thread: we will never actually start
- * that IO request until we have finished with the current one, and so
- * we will not deadlock.
- */
-
-int kpiod(void * unused)
-{
-	struct task_struct *tsk = current;
-	struct wait_queue wait = { tsk, };
-	struct inode * inode;
-	struct dentry * dentry;
-	struct pio_request * p;
-
-	tsk->session = 1;
-	tsk->pgrp = 1;
-	strcpy(tsk->comm, "kpiod");
-	sigfillset(&tsk->blocked);
-	init_waitqueue(&pio_wait);
-	/*
-	 * Mark this task as a memory allocator - we don't want to get caught
-	 * up in the regular mm freeing frenzy if we have to allocate memory
-	 * in order to write stuff out.
-	 */
-	tsk->flags |= PF_MEMALLOC;
-
-	lock_kernel();
-
-	pio_request_cache = kmem_cache_create("pio_request",
-					      sizeof(struct pio_request),
-					      0, SLAB_HWCACHE_ALIGN,
-					      NULL, NULL);
-	if (!pio_request_cache)
-		panic ("Could not create pio_request slab cache");
-
-	while (1) {
-		tsk->state = TASK_INTERRUPTIBLE;
-		add_wait_queue(&pio_wait, &wait);
-		if (!pio_first)
-			schedule();
-		remove_wait_queue(&pio_wait, &wait);
-		tsk->state = TASK_RUNNING;
-
-		while (pio_first) {
-			p = get_pio_request();
-			dentry = p->file->f_dentry;
-			inode = dentry->d_inode;
-
-			down(&inode->i_sem);
-			do_write_page(inode, p->file,
-				      (const char *) p->page, p->offset);
-			up(&inode->i_sem);
-			fput(p->file);
-			page_cache_free(p->page);
-			kmem_cache_free(pio_request_cache, p);
-		}
-	}
 }
 
 void __init page_cache_init(unsigned long memory_size)
diff -urN 2.2.17pre9/mm/page_alloc.c 2.2.17pre9-VM/mm/page_alloc.c
--- 2.2.17pre9/mm/page_alloc.c	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/mm/page_alloc.c	Sun Jul 2 23:56:30 2000
@@ -93,34 +93,68 @@
  */
 spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
 
-static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
-{
+#define list(x) (mem_map+(x))
+#define __free_pages_ok(map_nr, mask, area, index)		\
+	nr_free_pages -= (mask);				\
+	while ((mask) + (1 << (NR_MEM_LISTS-1))) {		\
+		if (!test_and_change_bit((index), (area)->map))	\
+			break;					\
+		(area)->count--;				\
+		remove_mem_queue(list((map_nr) ^ -(mask)));	\
+		(mask) <<= 1;					\
+		(area)++;					\
+		(index) >>= 1;					\
+		(map_nr) &= (mask);				\
+	}							\
+	add_mem_queue(area, list(map_nr));
+
+static void free_local_pages(struct page * page) {
+	unsigned long order = page->offset;
+	unsigned int type = PageDMA(page) ? 1 : 0;
 	struct free_area_struct *area = free_area[type] + order;
-	unsigned long index = map_nr >> (1 + order);
+	unsigned long map_nr = page - mem_map;
 	unsigned long mask = (~0UL) << order;
-	unsigned long flags;
+	unsigned long index = map_nr >> (1 + order);
 
-	spin_lock_irqsave(&page_alloc_lock, flags);
+	__free_pages_ok(map_nr, mask, area, index);
+}
 
-#define list(x) (mem_map+(x))
+static inline void free_pages_ok(unsigned long map_nr, unsigned long order, unsigned type)
+{
+	struct free_area_struct *area;
+	unsigned long index;
+	unsigned long mask;
+	unsigned long flags;
+	struct page * page;
 
+	if (current->flags & PF_FREE_PAGES)
+		goto local_freelist;
+ back_local_freelist:
+
+	area = free_area[type] + order;
+	index = map_nr >> (1 + order);
+	mask = (~0UL) << order;
 	map_nr &= mask;
-	nr_free_pages -= mask;
-	while (mask + (1 << (NR_MEM_LISTS-1))) {
-		if (!test_and_change_bit(index, area->map))
-			break;
-		area->count--;
-		remove_mem_queue(list(map_nr ^ -mask));
-		mask <<= 1;
-		area++;
-		index >>= 1;
-		map_nr &= mask;
-	}
-	add_mem_queue(area, list(map_nr));
-
-#undef list
+	spin_lock_irqsave(&page_alloc_lock, flags);
+	__free_pages_ok(map_nr, mask, area, index);
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
+	return;
+
+ local_freelist:
+	/*
+	 * This is a little subtle: if the allocation order
+	 * wanted is greater than zero we'd better take all the pages
+	 * local since we must deal with fragmentation too and we
+	 * can't rely on the nr_local_pages information.
+	 */
+	if (current->nr_local_pages && !current->allocation_order)
+		goto back_local_freelist;
+
+	page = mem_map + map_nr;
+	list_add((struct list_head *) page, &current->local_pages);
+	page->offset = order;
+	current->nr_local_pages++;
 }
 
 void __free_pages(struct page *page, unsigned long order)
@@ -179,13 +213,32 @@
 		atomic_set(&map->count, 1); \
 	} while (0)
 
+static void refile_local_pages(void)
+{
+	if (current->nr_local_pages) {
+		struct page * page;
+		struct list_head * entry;
+		int nr_pages = current->nr_local_pages;
+
+		while ((entry = current->local_pages.next) != &current->local_pages) {
+			list_del(entry);
+			page = (struct page *) entry;
+			free_local_pages(page);
+			if (!nr_pages--)
+				panic("__get_free_pages local_pages list corrupted I");
+		}
+		if (nr_pages)
+			panic("__get_free_pages local_pages list corrupted II");
+		current->nr_local_pages = 0;
+	}
+}
+
 unsigned long __get_free_pages(int gfp_mask, unsigned long order)
 {
 	unsigned long flags;
-	static atomic_t free_before_allocate = ATOMIC_INIT(0);
 
 	if (order >= NR_MEM_LISTS)
-		goto nopage;
+		goto out;
 
 #ifdef ATOMIC_MEMORY_DEBUGGING
 	if ((gfp_mask & __GFP_WAIT) && in_interrupt()) {
@@ -194,26 +247,24 @@
 			printk("gfp called nonatomically from interrupt %p\n",
 			       __builtin_return_address(0));
 		}
-		goto nopage;
+		goto out;
 	}
 #endif
 
 	/*
+	 * Acquire lock before reading nr_free_pages to make sure it
+	 * won't change from under us.
+	 */
+	spin_lock_irqsave(&page_alloc_lock, flags);
+
+	/*
 	 * If this is a recursive call, we'd better
 	 * do our best to just allocate things without
 	 * further thought.
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
-		int freed;
 		extern struct wait_queue * kswapd_wait;
 
-		/* Somebody needs to free pages so we free some of our own. */
-		if (atomic_read(&free_before_allocate)) {
-			current->flags |= PF_MEMALLOC;
-			try_to_free_pages(gfp_mask);
-			current->flags &= ~PF_MEMALLOC;
-		}
-
 		if (nr_free_pages > freepages.low)
 			goto ok_to_allocate;
 
@@ -224,34 +275,44 @@
 		if (nr_free_pages > freepages.min)
 			goto ok_to_allocate;
 
-		current->flags |= PF_MEMALLOC;
-		atomic_inc(&free_before_allocate);
-		freed = try_to_free_pages(gfp_mask);
-		atomic_dec(&free_before_allocate);
-		current->flags &= ~PF_MEMALLOC;
-
-		/*
-		 * Re-check we're still low on memory after we blocked
-		 * for some time. Somebody may have released lots of
-		 * memory from under us while we was trying to free
-		 * the pages. We check against pages_high to be sure
-		 * to succeed only if lots of memory is been released.
-		 */
-		if (nr_free_pages > freepages.high)
-			goto ok_to_allocate;
+		if (gfp_mask & __GFP_WAIT) {
+			int freed;
+			/*
+			 * If the task is ok to sleep it's fine also
+			 * if we release irq here.
+			 */
+			spin_unlock_irq(&page_alloc_lock);
+
+			current->flags |= PF_MEMALLOC|PF_FREE_PAGES;
+			current->allocation_order = order;
+			freed = try_to_free_pages(gfp_mask);
+			current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES);
+
+			spin_lock_irq(&page_alloc_lock);
+			refile_local_pages();
+
+			/*
+			 * Re-check we're still low on memory after we blocked
+			 * for some time. Somebody may have released lots of
+			 * memory from under us while we was trying to free
+			 * the pages. We check against pages_high to be sure
+			 * to succeed only if lots of memory is been released.
+			 */
+			if (nr_free_pages > freepages.high)
+				goto ok_to_allocate;
 
-		if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
-			goto nopage;
+			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+				goto nopage;
+		}
 	}
 ok_to_allocate:
-	spin_lock_irqsave(&page_alloc_lock, flags);
 	/* if it's not a dma request, try non-dma first */
 	if (!(gfp_mask & __GFP_DMA))
 		RMQUEUE_TYPE(order, 0);
 	RMQUEUE_TYPE(order, 1);
+ nopage:
 	spin_unlock_irqrestore(&page_alloc_lock, flags);
-
-nopage:
+ out:
 	return 0;
 }
 
@@ -310,8 +371,8 @@
 	 * analysis.
 	 */
 	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
-	if (i < 10)
-		i = 10;
+	if (i < 50)
+		i = 50;
 	if (i > 256)
 		i = 256;
 	freepages.min = i;
diff -urN 2.2.17pre9/mm/vmscan.c 2.2.17pre9-VM/mm/vmscan.c
--- 2.2.17pre9/mm/vmscan.c	Wed Jun 28 17:13:15 2000
+++ 2.2.17pre9-VM/mm/vmscan.c	Sun Jul 2 23:57:54 2000
@@ -327,7 +327,7 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = nr_tasks / (priority+1);
+	counter = nr_tasks / priority;
 	if (counter < 1)
 		counter = 1;
 
@@ -377,11 +377,9 @@
  * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
-static int do_try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask)
 {
 	int priority;
-	int ret = 0;
-	int swapcount;
 	int count = SWAP_CLUSTER_MAX;
 
 	lock_kernel();
@@ -392,7 +390,6 @@
 	priority = 6;
 	do {
 		while (shrink_mmap(priority, gfp_mask)) {
-			ret = 1;
 			if (!--count)
 				goto done;
 		}
@@ -400,30 +397,24 @@
 		/* Try to get rid of some shared memory pages.. */
 		if (gfp_mask & __GFP_IO) {
 			while (shm_swap(priority, gfp_mask)) {
-				ret = 1;
 				if (!--count)
 					goto done;
 			}
 		}
 
 		/* Then, try to page stuff out.. */
-		swapcount = count;
 		while (swap_out(priority, gfp_mask)) {
-			ret = 1;
-			if (!--swapcount)
-				break;
+			if (!--count)
+				goto done;
 		}
 
 		shrink_dcache_memory(priority, gfp_mask);
-	} while (--priority >= 0);
+	} while (--priority > 0);
 done:
 	unlock_kernel();
 
-	if (!ret)
-		printk("VM: do_try_to_free_pages failed for %s...\n",
-		       current->comm);
 
 	/* Return success if we freed a page. */
-	return ret;
+	return priority > 0;
 }
 
 /*
@@ -499,7 +490,7 @@
 
 		while (nr_free_pages < freepages.high)
 		{
-			if (do_try_to_free_pages(GFP_KSWAPD))
+			if (try_to_free_pages(GFP_KSWAPD))
 			{
 				if (tsk->need_resched)
 					schedule();
@@ -510,17 +501,3 @@
 		}
 	}
 }
-
-/*
- * Called by non-kswapd processes when kswapd really cannot
- * keep up with the demand for free memory.
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
-	int retval = 1;
-
-	if (gfp_mask & __GFP_WAIT)
-		retval = do_try_to_free_pages(gfp_mask);
-	return retval;
-}
-
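
For reference, below is a minimal standalone userspace sketch (not part of the patch, and not kernel code) of the balance_dirty_state() arithmetic that the fs/buffer.c changes above introduce: dirty buffer pages are compared against a soft limit of tot * nfract / 200 pages and a hard limit of twice that, returning -1 (no flush needed), 0 (async flush) or 1 (sync flush). The nfract and memory-size values used here are assumed examples for illustration only, not values taken from the patch or from the bdf_prm defaults.

#include <stdio.h>

/* Same decision logic as balance_dirty_state() in the patch, with the
 * inputs passed in explicitly instead of read from kernel globals. */
static int balance_dirty_state(unsigned long dirty_pages,
                               unsigned long total_pages,
                               unsigned long nfract)
{
        unsigned long dirty = dirty_pages * 200;
        unsigned long soft_dirty_limit = total_pages * nfract;
        unsigned long hard_dirty_limit = soft_dirty_limit * 2;

        if (dirty > soft_dirty_limit) {
                if (dirty > hard_dirty_limit)
                        return 1;       /* sync flush: wake bdflush and wait */
                return 0;               /* async flush: wake bdflush only */
        }
        return -1;                      /* below the soft limit: do nothing */
}

int main(void)
{
        unsigned long tot = 32768;      /* assumed: 128MB worth of 4KB pages */
        unsigned long nfract = 40;      /* assumed example bdflush nfract */
        unsigned long dirty;

        /* Soft limit works out to tot*nfract/200 pages, hard limit twice that. */
        for (dirty = 0; dirty <= tot; dirty += tot / 8)
                printf("dirty=%5lu/%lu -> %d\n", dirty, tot,
                       balance_dirty_state(dirty, tot, nfract));
        return 0;
}

With the assumed nfract of 40, the soft limit sits at 20% of the considered pages and the hard limit at 40%, which is where balance_dirty() switches from an asynchronous bdflush wakeup to a synchronous one.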