diff -urN vm-ref/arch/i386/config.in vm/arch/i386/config.in --- vm-ref/arch/i386/config.in Sun Oct 28 16:41:45 2001 +++ vm/arch/i386/config.in Sun Oct 28 16:42:04 2001 @@ -403,6 +403,7 @@ bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE + bool ' Debug allocation faliures' CONFIG_DEBUG_GFP fi endmenu diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Sun Oct 28 16:41:46 2001 +++ vm/fs/buffer.c Sun Oct 28 16:42:04 2001 @@ -710,11 +710,9 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); + try_to_free_pages_nozone(GFP_NOFS); run_task_queue(&tq_disk); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); @@ -2382,23 +2380,27 @@ return 1; } -static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask) +static int sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 0; + int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { + tryagain = 0; continue; + } /* Second time through we start actively writing out.. */ if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (!test_bit(BH_launder, &bh->b_state)) + if (!test_bit(BH_launder, &bh->b_state)) { + tryagain = 0; continue; + } wait_on_buffer(bh); tryagain = 1; continue; @@ -2485,7 +2487,7 @@ spin_unlock(&lru_list_lock); if (gfp_mask & __GFP_IO) { if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { - if (sync_page_buffers(bh, gfp_mask)) { + if (sync_page_buffers(bh)) { /* no IO or waiting next time */ gfp_mask = 0; goto cleaned_buffers_try_again; diff -urN vm-ref/fs/exec.c vm/fs/exec.c --- vm-ref/fs/exec.c Sun Oct 28 16:41:45 2001 +++ vm/fs/exec.c Sun Oct 28 16:42:04 2001 @@ -275,7 +275,6 @@ goto out; if (!pte_none(*pte)) goto out; - lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h --- vm-ref/include/linux/mm.h Sun Oct 28 16:41:46 2001 +++ vm/include/linux/mm.h Sun Oct 28 16:42:04 2001 @@ -296,6 +296,10 @@ #define PageLaunder(page) test_bit(PG_launder, &(page)->flags) #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) +#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) + extern void __set_page_dirty(struct page *); static inline void set_page_dirty(struct page * page) @@ -311,7 +315,7 @@ * parallel wait_on_page). */ #define UnlockPage(page) do { \ - clear_bit(PG_launder, &(page)->flags); \ + ClearPageLaunder(page); \ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ @@ -402,7 +406,6 @@ /* * There is only one 'core' page-freeing function. 
*/ -extern void FASTCALL(free_lru_page(struct page *)); extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); @@ -411,6 +414,8 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) + +extern int start_aggressive_readahead(unsigned int); extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Sun Oct 28 15:44:39 2001 +++ vm/include/linux/mmzone.h Sun Oct 28 16:42:04 2001 @@ -41,6 +41,7 @@ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; + int nr_active_pages, nr_inactive_pages; /* * free areas of different sizes diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Sun Oct 28 16:41:45 2001 +++ vm/include/linux/sched.h Sun Oct 28 16:42:04 2001 @@ -280,6 +280,14 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +struct zone_struct; + +struct local_pages { + struct list_head list; + unsigned int order, nr; + struct zone_struct * classzone; +}; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -318,8 +326,7 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; struct rw_sem_recursor mm_recursor; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + struct local_pages local_pages; /* task state */ struct linux_binfmt *binfmt; diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Sun Oct 28 16:41:46 2001 +++ vm/include/linux/swap.h Sun Oct 28 16:42:05 2001 @@ -112,6 +112,8 @@ /* linux/mm/vmscan.c */ extern wait_queue_head_t kswapd_wait; extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages_nozone(unsigned int)); +extern int vm_scan_ratio, vm_balance_ratio, vm_mapped_ratio; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -132,7 +134,6 @@ extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ -extern int out_of_memory(void); extern void oom_kill(void); /* linux/mm/swapfile.c */ @@ -176,34 +177,100 @@ BUG(); \ } while (0) +#define inc_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages++; \ + __classzone++; \ + } \ + nr_active_pages++; \ +} while (0) + +#define dec_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages--; \ + __classzone++; \ + } \ + nr_active_pages--; \ +} while (0) + +#define inc_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages++; \ + __classzone++; \ + } \ + nr_inactive_pages++; \ +} while (0) + +#define 
dec_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages--; \ + __classzone++; \ + } \ + nr_inactive_pages--; \ +} while (0) + #define add_page_to_active_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ + inc_nr_active_pages(page); \ } while (0) #define add_page_to_inactive_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ - SetPageInactive(page); \ + SetPageInactive(page); \ list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ + inc_nr_inactive_pages(page); \ } while (0) #define del_page_from_active_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageActive(page); \ - nr_active_pages--; \ + dec_nr_active_pages(page); \ + DEBUG_LRU_PAGE(page); \ } while (0) #define del_page_from_inactive_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageInactive(page); \ - nr_inactive_pages--; \ + dec_nr_inactive_pages(page); \ + DEBUG_LRU_PAGE(page); \ } while (0) /* diff -urN vm-ref/include/linux/sysctl.h vm/include/linux/sysctl.h --- vm-ref/include/linux/sysctl.h Sun Oct 28 16:41:46 2001 +++ vm/include/linux/sysctl.h Sun Oct 28 16:42:05 2001 @@ -134,12 +134,13 @@ VM_FREEPG=3, /* struct: Set free page thresholds */ VM_BDFLUSH=4, /* struct: Control buffer cache flushing */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ - VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ - VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_HEAP_STACK_GAP=11, /* int: page gap between heap and stack */ + VM_SCAN_RATIO=12, /* part of the inactive list to scan */ + VM_BALANCE_RATIO=13, /* balance active and inactive caches */ + VM_MAPPED_RATIO=14, /* pageout when we find too many mapped pages */ }; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Sun Oct 28 16:41:44 2001 +++ vm/kernel/fork.c Sun Oct 28 16:42:04 2001 @@ -645,7 +645,7 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); + INIT_LIST_HEAD(&p->local_pages.list); retval = -ENOMEM; /* copy all the process information */ diff -urN vm-ref/kernel/ksyms.c vm/kernel/ksyms.c --- vm-ref/kernel/ksyms.c Sun Oct 28 16:41:45 2001 +++ vm/kernel/ksyms.c Sun Oct 28 16:42:04 2001 @@ -89,6 +89,7 @@ EXPORT_SYMBOL(exit_sighand); /* internal kernel memory management */ +EXPORT_SYMBOL(start_aggressive_readahead); EXPORT_SYMBOL(_alloc_pages); EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Sun Oct 28 16:41:46 2001 +++ vm/kernel/sysctl.c Sun Oct 28 16:42:05 2001 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -259,6 +260,12 @@ }; static ctl_table vm_table[] = { + {VM_SCAN_RATIO, "vm_scan_ratio", + &vm_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_BALANCE_RATIO, "vm_balance_ratio", + &vm_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAPPED_RATIO, "vm_mapped_ratio", + &vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, 
&proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Sun Oct 28 16:41:46 2001 +++ vm/mm/filemap.c Sun Oct 28 16:42:04 2001 @@ -50,8 +50,13 @@ spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED}; /* - * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with - * the pagemap_lru_lock held. + * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock + * with the pagecache_lock held. + * + * Ordering: + * swap_lock -> + * pagemap_lru_lock -> + * pagecache_lock */ spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; @@ -165,8 +170,8 @@ head = &inode->i_mapping->clean_pages; - spin_lock(&pagecache_lock); spin_lock(&pagemap_lru_lock); + spin_lock(&pagecache_lock); curr = head->next; while (curr != head) { @@ -197,8 +202,8 @@ continue; } - spin_unlock(&pagemap_lru_lock); spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); } static inline void truncate_partial_page(struct page *page, unsigned partial) @@ -634,8 +639,9 @@ spin_lock(&pagecache_lock); add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, page_hash(mapping, index)); - lru_cache_add(page); spin_unlock(&pagecache_lock); + + lru_cache_add(page); } /* @@ -654,7 +660,6 @@ page->index = offset; add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, hash); - lru_cache_add(page); } void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) @@ -662,6 +667,7 @@ spin_lock(&pagecache_lock); __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); spin_unlock(&pagecache_lock); + lru_cache_add(page); } int add_to_page_cache_unique(struct page * page, @@ -681,6 +687,8 @@ } spin_unlock(&pagecache_lock); + if (!err) + lru_cache_add(page); return err; } @@ -912,7 +920,9 @@ newpage = NULL; } spin_unlock(&pagecache_lock); - if (unlikely(newpage != NULL)) + if (newpage == NULL) + lru_cache_add(page); + else page_cache_release(newpage); } } @@ -1391,6 +1401,7 @@ page = cached_page; __add_to_page_cache(page, mapping, index, hash); spin_unlock(&pagecache_lock); + lru_cache_add(page); cached_page = NULL; goto readpage; @@ -2960,8 +2971,15 @@ } unlock: kunmap(page); + + /* + * Mark the page accessed if we wrote the + * beginning or we just did an lseek. + */ + if (!offset || !file->f_reada) + SetPageReferenced(page); + /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); page_cache_release(page); diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Sun Oct 28 16:41:46 2001 +++ vm/mm/memory.c Sun Oct 28 16:42:04 2001 @@ -965,7 +965,7 @@ if (!new_page) goto no_mem; copy_cow_page(old_page,new_page,address); - free_lru_page(old_page); + page_cache_release(old_page); /* * Re-check the pte - we dropped the lock @@ -975,19 +975,18 @@ if (PageReserved(old_page)) ++mm->rss; break_cow(vma, new_page, address, page_table); - lru_cache_add(new_page); /* Free the old page.. 
*/ new_page = old_page; } - free_lru_page(new_page); + page_cache_release(new_page); return 1; /* Minor fault */ bad_wp_page: printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); return -1; no_mem: - free_lru_page(old_page); + page_cache_release(old_page); spin_lock(&mm->page_table_lock); return -1; } @@ -1215,7 +1214,6 @@ mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - lru_cache_add(page); } set_pte(page_table, entry); diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Wed Oct 10 02:16:27 2001 +++ vm/mm/oom_kill.c Sun Oct 28 16:42:04 2001 @@ -192,67 +192,3 @@ schedule(); return; } - -static inline int node_zones_low(pg_data_t *pgdat) -{ - zone_t * zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - - if (zone->free_pages > (zone->pages_low)) - return 0; - - } - return 1; -} - -static int all_zones_low(void) -{ - pg_data_t * pgdat = pgdat_list; - - pgdat = pgdat_list; - do { - if (node_zones_low(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; -} - -/** - * out_of_memory - is the system out of memory? - * - * Returns 0 if there is still enough memory left, - * 1 when we are out of memory (otherwise). - */ -int out_of_memory(void) -{ - long cache_mem, limit; - - /* Enough free memory? Not OOM. */ - if (!all_zones_low()) - return 0; - - /* Enough swap space left? Not OOM. */ - if (nr_swap_pages > 0) - return 0; - - /* - * If the buffer and page cache (including swap cache) are over - * their (/proc tunable) minimum, we're still not OOM. We test - * this to make sure we don't return OOM when the system simply - * has a hard time with the cache. - */ - cache_mem = atomic_read(&page_cache_size); - limit = 2; - limit *= num_physpages / 100; - - if (cache_mem > limit) - return 0; - - /* Else... */ - return 1; -} diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Sun Oct 28 16:41:45 2001 +++ vm/mm/page_alloc.c Sun Oct 28 16:42:04 2001 @@ -146,12 +146,13 @@ * local since we must deal with fragmentation too and we * can't rely on the nr_local_pages information. 
*/ - if (current->nr_local_pages && !current->allocation_order) + if ((current->local_pages.nr && !current->local_pages.order) || + !memclass(page->zone, current->local_pages.classzone)) goto back_local_freelist; - list_add(&page->list, ¤t->local_pages); + list_add(&page->list, ¤t->local_pages.list); page->index = order; - current->nr_local_pages++; + current->local_pages.nr++; } #define MARK_USED(index, order, area) \ @@ -233,35 +234,36 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) { struct page * page = NULL; - int __freed = 0; + int __freed; - if (!(gfp_mask & __GFP_WAIT)) - goto out; if (in_interrupt()) BUG(); - current->allocation_order = order; + current->local_pages.order = order; + current->local_pages.classzone = classzone; current->flags |= PF_MEMALLOC | PF_FREE_PAGES; __freed = try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - if (current->nr_local_pages) { + if (current->local_pages.nr) { struct list_head * entry, * local_pages; struct page * tmp; int nr_pages; - local_pages = ¤t->local_pages; + local_pages = ¤t->local_pages.list; if (likely(__freed)) { /* pick from the last inserted so we're lifo */ entry = local_pages->next; do { tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { + if (!memclass(tmp->zone, classzone)) + BUG(); + if (tmp->index == order) { list_del(entry); - current->nr_local_pages--; + current->local_pages.nr--; set_page_count(tmp, 1); page = tmp; @@ -289,7 +291,7 @@ } while ((entry = entry->next) != local_pages); } - nr_pages = current->nr_local_pages; + nr_pages = current->local_pages.nr; /* free in reverse order so that the global order will be lifo */ while ((entry = local_pages->prev) != local_pages) { list_del(entry); @@ -298,9 +300,8 @@ if (!nr_pages--) BUG(); } - current->nr_local_pages = 0; + current->local_pages.nr = 0; } - out: *freed = __freed; return page; } @@ -358,8 +359,7 @@ /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & PF_MEMALLOC) { + if (current->flags & PF_MEMALLOC && !in_interrupt()) { zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); @@ -375,34 +375,52 @@ /* Atomic allocations - we can't balance anything */ if (!(gfp_mask & __GFP_WAIT)) - return NULL; + goto out; + rebalance: page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - if (zone_free_pages(z, order) > z->pages_min) { - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > z->pages_min) { + page = rmqueue(z, order); + if (page) + return page; + } } - } + goto rebalance; + } else { + /* + * Check that no other task is been killed meanwhile, + * in such a case we can succeed the allocation. 
+ */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* Don't let big-order allocations loop */ - if (order > 1) - return NULL; + if (zone_free_pages(z, order) > z->pages_high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); +#ifdef CONFIG_DEBUG_GFP + show_stack(NULL); +#endif + return NULL; } /* @@ -431,15 +449,6 @@ return 0; } -void free_lru_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) { - if (PageActive(page) || PageInactive(page)) - lru_cache_del(page); - __free_pages_ok(page, 0); - } -} - void __free_pages(struct page *page, unsigned int order) { if (!PageReserved(page) && put_page_testzero(page)) @@ -525,17 +534,24 @@ { pg_data_t *pgdat = pgdat_list; unsigned int sum = 0; + zonelist_t *zonelist; + zone_t **zonep, *zone; do { - zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); - zone_t **zonep = zonelist->zones; - zone_t *zone; + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); + zonep = zonelist->zones; - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + zone = *zonep; + if (zone) { + sum += zone->nr_inactive_pages; + do { + unsigned int free = zone->free_pages - zone->pages_high; + zonep++; + zone = *zonep; + if (free <= 0) + continue; + sum += free; + } while (zone); } pgdat = pgdat->node_next; @@ -558,6 +574,57 @@ } #endif +/* + * If it returns non zero it means there's lots of ram "free" + * (note: not in cache!) so any caller will know that + * he can allocate some memory to do some more aggressive + * (possibly wasteful) readahead. The state of the memory + * should be rechecked after every few pages allocated for + * doing this aggressive readahead. + * + * The gfp_mask parameter specifies in which kind of memory + * the readahead information will be applocated to. 
+ */ +int start_aggressive_readahead(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep, *zone; + int ret = 0; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + for (zone = *zonep++; zone; zone = *zonep++) + if (zone->free_pages > zone->pages_high * 2) + ret = 1; + + pgdat = pgdat->node_next; + } while (pgdat); + + return ret; +} + +int try_to_free_pages_nozone(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep; + int ret = 0; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + ret |= try_to_free_pages(*zonep, gfp_mask, 0); + + pgdat = pgdat->node_next; + } while (pgdat); + + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* @@ -572,28 +639,31 @@ pg_data_t *tmpdat = pgdat; printk("Free pages: %6dkB (%6dkB HighMem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - nr_free_highpages() << (PAGE_SHIFT-10)); + K(nr_free_pages()), + K(nr_free_highpages())); while (tmpdat) { zone_t *zone; for (zone = tmpdat->node_zones; zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) - printk("Zone:%s freepages:%6lukB min:%6luKB low:%6lukB " - "high:%6lukB\n", - zone->name, - K(zone->free_pages), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high)); - + printk("Zone:%s freepages:%6lukB|%lu min:%6luKB|%lu low:%6lukB|%lu high:%6lukB:%lu active:%6dkB|%d inactive:%6dkB|%d\n", + zone->name, + K(zone->free_pages), + zone->free_pages, + K(zone->pages_min), + zone->pages_min, + K(zone->pages_low), + zone->pages_low, + K(zone->pages_high), + zone->pages_high, + K(zone->nr_active_pages), + zone->nr_active_pages, + K(zone->nr_inactive_pages), + zone->nr_inactive_pages); + tmpdat = tmpdat->node_next; } - printk("Free pages: %6dkB (%6dkB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - printk("( Active: %d, inactive: %d, free: %d )\n", nr_active_pages, nr_inactive_pages, @@ -768,6 +838,7 @@ zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; if (!size) continue; diff -urN vm-ref/mm/shmem.c vm/mm/shmem.c --- vm-ref/mm/shmem.c Sat Oct 27 10:06:54 2001 +++ vm/mm/shmem.c Sun Oct 28 16:42:04 2001 @@ -550,7 +550,7 @@ swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Sat Oct 27 10:06:54 2001 +++ vm/mm/swap.c Sun Oct 28 16:42:04 2001 @@ -87,11 +87,11 @@ */ void lru_cache_add(struct page * page) { - if (!PageActive(page) && !PageInactive(page)) { - spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); - spin_unlock(&pagemap_lru_lock); - } + if (!PageLocked(page)) + BUG(); + spin_lock(&pagemap_lru_lock); + add_page_to_inactive_list(page); + spin_unlock(&pagemap_lru_lock); } /** @@ -107,9 +107,9 @@ del_page_from_active_list(page); } else if (PageInactive(page)) { del_page_from_inactive_list(page); - } else { -// printk("VM: __lru_cache_del, found unknown page ?!\n"); - } + } else + printk("VM: __lru_cache_del, found unknown page ?!\n"); + DEBUG_LRU_PAGE(page); } /** @@ -118,6 +118,8 @@ */ void lru_cache_del(struct page * page) { + 
if (!PageLocked(page)) + BUG(); spin_lock(&pagemap_lru_lock); __lru_cache_del(page); spin_unlock(&pagemap_lru_lock); diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c --- vm-ref/mm/swap_state.c Sat Oct 27 10:06:54 2001 +++ vm/mm/swap_state.c Sun Oct 28 16:42:04 2001 @@ -17,17 +17,8 @@ #include -/* - * We may have stale swap cache pages in memory: notice - * them here and get rid of the unnecessary final write. - */ static int swap_writepage(struct page *page) { - if (exclusive_swap_page(page)) { - delete_from_swap_cache(page); - UnlockPage(page); - return 0; - } rw_swap_page(WRITE, page); return 0; } @@ -152,7 +143,7 @@ delete_from_swap_cache(page); UnlockPage(page); } - free_lru_page(page); + page_cache_release(page); } /* diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Sat Oct 27 10:06:54 2001 +++ vm/mm/vmscan.c Sun Oct 28 16:42:05 2001 @@ -26,12 +26,28 @@ #include /* - * The "priority" of VM scanning is how much of the queues we - * will scan in one go. A value of 6 for DEF_PRIORITY implies - * that we'll scan 1/64th of the queues ("queue_length >> 6") - * during a normal aging round. + * "vm_scan_ratio" is how much of the queues we will scan + * in one go. A value of 6 for vm_scan_ratio implies that we'll + * scan 1/6 of the inactive list during a normal aging round. + * So if 1/vm_scan_ratio of the inactive cache is unfreeable + * we'll start the background paging. */ -#define DEF_PRIORITY (6) +int vm_scan_ratio = 6; + +/* + * "vm_mapped_ratio" controls when we start to swapout, the lower, + * the earlier we'll start to swapout. + */ +int vm_mapped_ratio = 10; + +/* + * "vm_balance_ratio" controls the balance between active and + * inactive cache. The bigger vm_balance_ratio is, the easier the + * active cache will grow, because we'll rotate the active list + * slowly. A value of 3 means we'll go towards a balance of + * 1/4 of the cache being inactive. 
+ */ +int vm_balance_ratio = 3; /* * The swap-out function returns 1 if it successfully @@ -286,13 +302,13 @@ return count; } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) +static int FASTCALL(swap_out(unsigned int gfp_mask, zone_t * classzone)); +static int swap_out(unsigned int gfp_mask, zone_t * classzone) { int counter, nr_pages = SWAP_CLUSTER_MAX; struct mm_struct *mm; - counter = mmlist_nr; + counter = mmlist_nr << 1; do { if (unlikely(current->need_resched)) { __set_current_state(TASK_RUNNING); @@ -328,15 +344,13 @@ return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)); +static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask) { struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = nr_pages*10; + int max_mapped = nr_pages * vm_mapped_ratio; - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { struct page * page; if (unlikely(current->need_resched)) { @@ -355,18 +369,13 @@ list_del(entry); list_add(entry, &inactive_list); - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; - if (!memclass(page->zone, classzone)) continue; + max_scan--; + /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) + if (!page->buffers && page_count(page) != 1) goto page_mapped; /* @@ -459,37 +468,27 @@ } } - if (unlikely(!spin_trylock(&pagecache_lock))) { - /* we hold the page lock so the page cannot go away from under us */ - spin_unlock(&pagemap_lru_lock); + if (unlikely(!page->mapping)) + BUG(); - spin_lock(&pagecache_lock); - spin_lock(&pagemap_lru_lock); - } + spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * This is the non-racy check for busy page. + * It is critical to check PageDirty _after_ we made sure + * the page is freeable so not in use by anybody. + * At this point we're guaranteed that page->buffers is NULL, + * nobody can refill page->buffers under us because we still + * hold the page lock. */ - if (!page->mapping || !is_page_cache_freeable(page)) { + if (unlikely(page_count(page) > 1)) { spin_unlock(&pagecache_lock); UnlockPage(page); -page_mapped: + page_mapped: if (--max_mapped >= 0) continue; - - /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! - */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; + break; } - - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. - */ if (PageDirty(page)) { spin_unlock(&pagecache_lock); UnlockPage(page); @@ -530,70 +529,100 @@ * We move them the other way when we see the * reference bit on the page. 
*/ -static void refill_inactive(int nr_pages) +static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); +static void refill_inactive(int nr_pages, zone_t * classzone) { struct list_head * entry; - spin_lock(&pagemap_lru_lock); entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { + while (nr_pages && entry != &active_list) { struct page * page; page = list_entry(entry, struct page, lru); entry = entry->prev; + + if (!memclass(page->zone, classzone)) + continue; + if (PageTestandClearReferenced(page)) { list_del(&page->lru); list_add(&page->lru, &active_list); continue; } + nr_pages--; + del_page_from_active_list(page); add_page_to_inactive_list(page); + SetPageReferenced(page); + } + if (entry != &active_list) { + list_del(&active_list); + list_add(&active_list, entry); } - spin_unlock(&pagemap_lru_lock); } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages)); +static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - int chunk_size = nr_pages; + int max_scan; unsigned long ratio; nr_pages -= kmem_cache_reap(gfp_mask); if (nr_pages <= 0) return 0; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); - - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; - - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); + spin_lock(&pagemap_lru_lock); + ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_balance_ratio) + 1); + if (ratio > nr_pages * 2) { + shrink_dcache_memory(vm_scan_ratio, gfp_mask); + shrink_icache_memory(vm_scan_ratio, gfp_mask); #ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + shrink_dqcache_memory(vm_scan_ratio, gfp_mask); #endif + ratio = nr_pages * 2; + } + refill_inactive(ratio, classzone); + + max_scan = classzone->nr_inactive_pages / vm_scan_ratio; + nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask); + return nr_pages; } +static int check_classzone_need_balance(zone_t * classzone); + int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) { - int ret = 0; - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + for (;;) { + int tries = vm_scan_ratio << 2; + int nr_pages = SWAP_CLUSTER_MAX; - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + do { + nr_pages = shrink_caches(classzone, gfp_mask, nr_pages); + if (nr_pages <= 0) + return 1; + + shrink_dcache_memory(vm_scan_ratio, gfp_mask); + shrink_icache_memory(vm_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_scan_ratio, gfp_mask); +#endif + if (!swap_out(gfp_mask, classzone)) + return 0; + } while (--tries); - return ret; + if (likely(current->pid != 1)) + break; + if (!check_classzone_need_balance(classzone)) + break; + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); + } + + return 0; } DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); @@ -625,7 +654,7 @@ if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; 
__set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(HZ*5); continue; } if (check_classzone_need_balance(zone)) @@ -648,9 +677,6 @@ do need_more_balance |= kswapd_balance_pgdat(pgdat); while ((pgdat = pgdat->node_next)); - if (need_more_balance && out_of_memory()) { - oom_kill(); - } } while (need_more_balance); }
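
A few notes on the main changes follow. Each C snippet is a small, standalone userspace sketch of the logic in the corresponding hunks; the struct and function names in the sketches are simplified stand-ins, not the kernel's own, so read them as illustrations rather than as kernel code.

mmzone.h/swap.h: every zone grows nr_active_pages/nr_inactive_pages counters, and the new inc_/dec_nr_*_pages() macros maintain them with classzone semantics - a page is counted in its own zone and in every higher zone of the same node, so zone->nr_inactive_pages can be read directly for whatever classzone an allocation is using (shrink_cache() and shrink_caches() rely on exactly that). Roughly:

	#include <stdio.h>

	#define MAX_NR_ZONES 3	/* DMA, Normal, HighMem on i386 */

	struct zone {
		const char *name;
		int nr_active_pages;	/* active pages in this zone or any lower zone */
	};

	struct pgdat {
		struct zone node_zones[MAX_NR_ZONES];
		int nr_zones;
	};

	/*
	 * Model of inc_nr_active_pages(): bump the counter of the page's own
	 * zone and of every higher zone in the same node, so each zone's
	 * counter is valid when that zone is used as the allocation classzone.
	 */
	static void inc_nr_active(struct pgdat *pgdat, int page_zone_idx)
	{
		int i;

		for (i = page_zone_idx; i < pgdat->nr_zones; i++)
			pgdat->node_zones[i].nr_active_pages++;
	}

	int main(void)
	{
		struct pgdat node = {
			{ { "DMA", 0 }, { "Normal", 0 }, { "HighMem", 0 } }, 3
		};
		int i;

		inc_nr_active(&node, 0);	/* page living in ZONE_DMA */
		inc_nr_active(&node, 1);	/* page living in ZONE_NORMAL */

		for (i = 0; i < node.nr_zones; i++)
			printf("classzone %s sees %d active pages\n",
			       node.node_zones[i].name,
			       node.node_zones[i].nr_active_pages);
		/* prints DMA: 1, Normal: 2, HighMem: 2 */
		return 0;
	}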
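
mm/filemap.c: the documented lock ordering is inverted - pagemap_lru_lock is now the outer lock and pagecache_lock the inner one (the clean_pages walk takes them in that order), and every add-to-page-cache path calls lru_cache_add() only after pagecache_lock has been dropped, so no path needs the LRU lock while holding the pagecache lock. A userspace picture of the two orderings the patch makes consistent, with pthread mutexes standing in for the spinlocks:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t pagemap_lru_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t pagecache_lock   = PTHREAD_MUTEX_INITIALIZER;

	/*
	 * Paths that need both locks take them in the documented order:
	 * pagemap_lru_lock first, pagecache_lock second.
	 */
	static void clean_pages_walk_like_path(void)
	{
		pthread_mutex_lock(&pagemap_lru_lock);
		pthread_mutex_lock(&pagecache_lock);
		/* ... walk mapping->clean_pages, drop unused pages ... */
		pthread_mutex_unlock(&pagecache_lock);
		pthread_mutex_unlock(&pagemap_lru_lock);
	}

	/*
	 * add_to_page_cache-like paths never want the LRU lock while the
	 * pagecache lock is held, because lru_cache_add() runs only after
	 * pagecache_lock is released.
	 */
	static void add_to_page_cache_like_path(void)
	{
		pthread_mutex_lock(&pagecache_lock);
		/* ... add_page_to_inode_queue(), add_page_to_hash_queue() ... */
		pthread_mutex_unlock(&pagecache_lock);

		pthread_mutex_lock(&pagemap_lru_lock);
		/* ... add_page_to_inactive_list() ... */
		pthread_mutex_unlock(&pagemap_lru_lock);
	}

	int main(void)
	{
		clean_pages_walk_like_path();
		add_to_page_cache_like_path();
		puts("no lock-order inversion");
		return 0;
	}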
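
mm/swap.c, mm/memory.c, fs/exec.c: lru_cache_add() now insists on a locked page and no longer tests PageActive/PageInactive itself, and the anonymous-fault, COW and put_dirty_page paths stop adding their pages to the LRU altogether; free_lru_page() is dropped and those paths go back to plain page_cache_release(). A minimal model of the tightened invariant, with assertions standing in for BUG()/DEBUG_LRU_PAGE():

	#include <assert.h>
	#include <stdio.h>

	struct page {
		int locked;
		int on_lru;	/* PageActive || PageInactive in the kernel */
	};

	/*
	 * Model of the new lru_cache_add(): the caller must hold the page
	 * lock, and is trusted not to insert the same page twice (the old
	 * "if (!PageActive && !PageInactive)" guard is gone; DEBUG_LRU_PAGE
	 * in add_page_to_inactive_list() catches violations).
	 */
	static void lru_cache_add(struct page *page)
	{
		assert(page->locked);		/* BUG() in the kernel */
		assert(!page->on_lru);		/* caller's responsibility now */
		page->on_lru = 1;
	}

	int main(void)
	{
		struct page page = { 0, 0 };

		page.locked = 1;	/* e.g. just added to the page cache, still locked */
		lru_cache_add(&page);
		printf("page on inactive list: %d\n", page.on_lru);
		return 0;
	}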
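
sched.h/page_alloc.c: allocation_order and nr_local_pages are folded into a struct local_pages that also remembers the classzone of the allocation being balanced. While a task frees pages under PF_FREE_PAGES inside try_to_free_pages(), suitable pages are parked on this per-task LIFO list so other tasks cannot steal them, and balance_classzone() afterwards takes a page of the right order from it (the rest are given back in reverse order, keeping the global list LIFO). A simplified model of the parking rule:

	#include <stdio.h>
	#include <stdlib.h>

	struct page {
		struct page *next;
		int order;
		int zone_idx;		/* 0 = DMA, 1 = Normal, 2 = HighMem */
	};

	/* Per-task state, mirroring the new struct local_pages in sched.h. */
	struct local_pages {
		struct page *list;	/* LIFO list of pages this task freed */
		int nr;
		int order;		/* order of the allocation being balanced */
		int classzone_idx;	/* classzone of that allocation */
	};

	/*
	 * Model of the PF_FREE_PAGES branch of the free path: keep the page
	 * only if it is usable for the pending allocation (memclass check),
	 * and for an order-0 allocation one parked page is already enough;
	 * anything else goes straight back to the buddy allocator.
	 */
	static void local_free(struct local_pages *lp, struct page *page)
	{
		if ((lp->nr && !lp->order) || page->zone_idx > lp->classzone_idx) {
			free(page);			/* "back_local_freelist" */
			return;
		}
		page->next = lp->list;
		lp->list = page;
		lp->nr++;
	}

	int main(void)
	{
		struct local_pages lp = { NULL, 0, 0, 1 };	/* 0-order, classzone Normal */
		int i;

		for (i = 0; i < 3; i++) {
			struct page *p = calloc(1, sizeof(*p));
			p->zone_idx = i % 3;
			local_free(&lp, p);
		}
		printf("pages kept for reuse: %d\n", lp.nr);	/* 1 */
		return 0;
	}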
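
page_alloc.c, __alloc_pages(): the old behaviour of yielding to kswapd and looping forever is gone. After balance_classzone() the allocator retries against pages_min only if pages were actually freed; otherwise it makes one last pass accepting only zones above pages_high (to pick up memory released if another task was OOM-killed meanwhile) and then fails, logging the order and gfp_mask, plus a stack dump when the new CONFIG_DEBUG_GFP option is set. A sketch of that tail end (the watermark numbers in main() are made up for the demo):

	#include <stdio.h>

	struct zone {
		const char *name;
		long free_pages, pages_min, pages_high;
	};

	static struct zone *scan(struct zone **zl, int use_high_mark)
	{
		struct zone *z;

		while ((z = *zl++) != NULL) {
			long mark = use_high_mark ? z->pages_high : z->pages_min;
			if (z->free_pages > mark)
				return z;
		}
		return NULL;
	}

	/*
	 * Sketch of __alloc_pages() after the fast paths failed and
	 * balance_classzone() returned; "freed" says whether
	 * try_to_free_pages() made any progress.
	 */
	static struct zone *slow_path(struct zone **zonelist, unsigned int order,
				      unsigned int gfp_mask, int freed)
	{
		struct zone *z;

		if (freed) {
			z = scan(zonelist, 0);	/* progress: retry against pages_min  */
			if (z)			/* (the real code then loops back to  */
				return z;	/*  rebalance instead of giving up)   */
		} else {
			z = scan(zonelist, 1);	/* no progress: another task may have */
			if (z)			/* been OOM-killed, so accept only    */
				return z;	/* zones above pages_high             */
		}

		printf("__alloc_pages: %u-order allocation failed (gfp=0x%x)\n",
		       order, gfp_mask);
		/* with CONFIG_DEBUG_GFP the kernel also calls show_stack(NULL) here */
		return NULL;
	}

	int main(void)
	{
		struct zone normal = { "Normal", 120, 128, 255 };
		struct zone *zonelist[] = { &normal, NULL };

		slow_path(zonelist, 0, 0x1d2, 0);	/* fails: 120 below pages_high */
		normal.free_pages = 600;
		return !slow_path(zonelist, 0, 0x1d2, 0); /* succeeds now */
	}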
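
fs/buffer.c and page_alloc.c: free_more_memory() used to hardwire zone 0 of contig_page_data's zonelist, which only ever looks at one node; it now calls the new try_to_free_pages_nozone(), which walks every pgdat and balances the top zone of that node's zonelist for the given gfp mask. A model of the walk:

	#include <stdio.h>

	struct zone { const char *name; int reclaimed; };

	struct pgdat {
		struct pgdat *node_next;
		struct zone *zonelist_head;	/* top zone of the zonelist for this gfp mask */
	};

	/* Stand-in for try_to_free_pages(zone, gfp_mask, 0). */
	static int try_to_free_pages(struct zone *classzone)
	{
		classzone->reclaimed = 1;
		printf("reclaiming against classzone %s\n", classzone->name);
		return 1;
	}

	/*
	 * Model of try_to_free_pages_nozone(): with no caller-supplied
	 * classzone (buffer.c has none), reclaim once per node against the
	 * first zone of that node's zonelist.
	 */
	static int try_to_free_pages_nozone(struct pgdat *pgdat_list)
	{
		struct pgdat *pgdat = pgdat_list;
		int ret = 0;

		do {
			ret |= try_to_free_pages(pgdat->zonelist_head);
			pgdat = pgdat->node_next;
		} while (pgdat);

		return ret;
	}

	int main(void)
	{
		struct zone z0 = { "node0/Normal", 0 }, z1 = { "node1/Normal", 0 };
		struct pgdat n1 = { NULL, &z1 }, n0 = { &n1, &z0 };

		return !try_to_free_pages_nozone(&n0);
	}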
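
page_alloc.c/ksyms.c: the exported start_aggressive_readahead(gfp_mask) lets a caller ask whether there is plenty of genuinely free memory (not merely freeable cache) before spending it on speculative readahead; it answers yes once some zone reachable from the gfp mask has more than twice its pages_high watermark free, and callers are expected to recheck every few allocated pages. In miniature:

	#include <stdio.h>

	struct zone { unsigned long free_pages, pages_high; };

	/*
	 * Model of start_aggressive_readahead(): nonzero means at least one
	 * zone usable for the given gfp mask has more than twice its
	 * pages_high watermark in truly free pages, so a wider readahead
	 * window is affordable for now.
	 */
	static int start_aggressive_readahead(struct zone *zones, int nr_zones)
	{
		int i, ret = 0;

		for (i = 0; i < nr_zones; i++)
			if (zones[i].free_pages > zones[i].pages_high * 2)
				ret = 1;

		return ret;
	}

	int main(void)
	{
		struct zone zones[2] = { { 1000, 255 }, { 100, 255 } };

		printf("aggressive readahead ok: %d\n",
		       start_aggressive_readahead(zones, 2));
		return 0;
	}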
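
sysctl.h/sysctl.c/vmscan.c: three new knobs replace DEF_PRIORITY and the removed buffermem/pagecache sysctls - vm_scan_ratio (fraction of the inactive list scanned per pass, default 6), vm_mapped_ratio (how quickly mapped pages on the inactive list push us into swap_out(), default 10) and vm_balance_ratio (active/inactive balance target, default 3). With the patch applied they appear under /proc/sys/vm; a small userspace reader (purely illustrative - the files exist only on a patched kernel):

	#include <stdio.h>

	/* Print one of the VM tunables exported by the patch, if present. */
	static void show(const char *name)
	{
		char path[64];
		FILE *f;
		int val;

		snprintf(path, sizeof(path), "/proc/sys/vm/%s", name);
		f = fopen(path, "r");
		if (!f || fscanf(f, "%d", &val) != 1)
			printf("%s: not available\n", name);
		else
			printf("%s = %d\n", name, val);
		if (f)
			fclose(f);
	}

	int main(void)
	{
		show("vm_scan_ratio");		/* inactive-list scan granularity */
		show("vm_balance_ratio");	/* active/inactive balance target */
		show("vm_mapped_ratio");	/* mapped pages -> swapout trigger */
		return 0;
	}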
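
vmscan.c, shrink_caches(): the inactive-list refill is now computed per classzone as nr_pages * nr_active / (nr_inactive * vm_balance_ratio + 1), so with the default vm_balance_ratio of 3 the lists settle around three quarters active and one quarter inactive; if the result exceeds nr_pages*2, the dcache/icache (and quota cache) are shrunk first and the refill is clamped to nr_pages*2. The arithmetic:

	#include <stdio.h>

	static int vm_balance_ratio = 3;	/* default from the patch */

	/*
	 * Model of the refill computation in shrink_caches(): how many active
	 * pages to move to the inactive list of this classzone.  When
	 * active == vm_balance_ratio * inactive (1/4 of the cache inactive
	 * with the default), the result is about nr_pages, i.e. the inactive
	 * list is refilled at the same rate it is being shrunk.
	 */
	static unsigned long refill_target(int nr_pages, unsigned long nr_active,
					   unsigned long nr_inactive)
	{
		return (unsigned long)nr_pages * nr_active /
		       (nr_inactive * vm_balance_ratio + 1);
	}

	int main(void)
	{
		printf("%lu\n", refill_target(32, 3000, 1000));	/* balanced: ~32        */
		printf("%lu\n", refill_target(32, 9000, 1000));	/* too active: ~96      */
		printf("%lu\n", refill_target(32, 1000, 9000));	/* plenty inactive: ~1  */
		return 0;
	}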
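
vmscan.c, shrink_cache()/try_to_free_pages()/kswapd: instead of the DEF_PRIORITY countdown, each pass scans at most nr_inactive/vm_scan_ratio pages of the classzone and gives up after meeting nr_pages*vm_mapped_ratio mapped pages, at which point the caller falls back to swap_out(); try_to_free_pages() allows itself vm_scan_ratio<<2 such passes (only init keeps retrying while the classzone still needs balance), kswapd sleeps five seconds instead of one when a zone cannot be balanced, and the out_of_memory()/oom_kill() trigger is no longer driven from kswapd. The scan budget in miniature:

	#include <stdio.h>

	static int vm_scan_ratio = 6;
	static int vm_mapped_ratio = 10;

	/*
	 * Model of the shrink_cache() budget: scan at most
	 * nr_inactive/vm_scan_ratio pages per pass, and bail out after
	 * nr_pages*vm_mapped_ratio mapped pages (the caller then lets
	 * swap_out() unmap something).  "freeable_every" fakes the mix on the
	 * inactive list: one freeable page-cache page every N scanned.
	 * Returns how many pages are still left to free.
	 */
	static int shrink_cache(int nr_pages, int nr_inactive, int freeable_every)
	{
		int max_scan = nr_inactive / vm_scan_ratio;
		int max_mapped = nr_pages * vm_mapped_ratio;
		int scanned;

		for (scanned = 1; scanned <= max_scan && nr_pages > 0; scanned++) {
			if (scanned % freeable_every == 0)
				nr_pages--;	/* freeable page-cache page */
			else if (--max_mapped < 0)
				break;		/* too many mapped pages: go swap */
		}
		return nr_pages;
	}

	int main(void)
	{
		/* mostly freeable inactive list: goal met within the budget */
		printf("left to free: %d\n", shrink_cache(32, 6000, 2));
		/* mostly mapped inactive list: bail out early, swap_out() next */
		printf("left to free: %d\n", shrink_cache(32, 6000, 500));
		return 0;
	}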