diff -urN vm-ref/arch/i386/config.in vm/arch/i386/config.in --- vm-ref/arch/i386/config.in Tue Oct 30 04:32:40 2001 +++ vm/arch/i386/config.in Tue Oct 30 04:32:51 2001 @@ -403,6 +403,7 @@ bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE + bool ' Debug allocation faliures' CONFIG_DEBUG_GFP fi endmenu diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Tue Oct 30 04:32:40 2001 +++ vm/fs/buffer.c Tue Oct 30 04:32:51 2001 @@ -115,7 +115,7 @@ int dummy5; /* unused */ } b_un; unsigned int data[N_PARAM]; -} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}}; +} bdf_prm = {{20, 0, 0, 0, 5*HZ, 30*HZ, 40, 0, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; @@ -710,12 +710,8 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); - run_task_queue(&tq_disk); + try_to_free_pages_nozone(GFP_NOIO); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); schedule(); @@ -1057,19 +1053,17 @@ if (state < 0) return; - /* If we're getting into imbalance, start write-out */ - spin_lock(&lru_list_lock); - write_some_buffers(NODEV); + wakeup_bdflush(); /* * And if we're _really_ out of balance, wait for - * some of the dirty/locked buffers ourselves and - * start bdflush. + * some of the dirty/locked buffers ourselves. * This will throttle heavy writers. */ if (state > 0) { + spin_lock(&lru_list_lock); + write_some_buffers(NODEV); wait_for_some_buffers(NODEV); - wakeup_bdflush(); } } @@ -2376,23 +2370,27 @@ return 1; } -static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask) +static int sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 0; + int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { + tryagain = 0; continue; + } /* Second time through we start actively writing out.. 
*/ if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (!test_bit(BH_launder, &bh->b_state)) + if (!test_bit(BH_launder, &bh->b_state)) { + tryagain = 0; continue; + } wait_on_buffer(bh); tryagain = 1; continue; @@ -2479,7 +2477,7 @@ spin_unlock(&lru_list_lock); if (gfp_mask & __GFP_IO) { if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) { - if (sync_page_buffers(bh, gfp_mask)) { + if (sync_page_buffers(bh)) { /* no IO or waiting next time */ gfp_mask = 0; goto cleaned_buffers_try_again; diff -urN vm-ref/fs/exec.c vm/fs/exec.c --- vm-ref/fs/exec.c Tue Oct 30 04:32:40 2001 +++ vm/fs/exec.c Tue Oct 30 04:32:51 2001 @@ -275,7 +275,6 @@ goto out; if (!pte_none(*pte)) goto out; - lru_cache_add(page); flush_dcache_page(page); flush_page_to_ram(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h --- vm-ref/include/linux/mm.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/mm.h Tue Oct 30 04:32:51 2001 @@ -296,6 +296,10 @@ #define PageLaunder(page) test_bit(PG_launder, &(page)->flags) #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) +#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) + extern void __set_page_dirty(struct page *); static inline void set_page_dirty(struct page * page) @@ -311,7 +315,7 @@ * parallel wait_on_page). */ #define UnlockPage(page) do { \ - clear_bit(PG_launder, &(page)->flags); \ + ClearPageLaunder(page); \ smp_mb__before_clear_bit(); \ if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); \ smp_mb__after_clear_bit(); \ @@ -399,7 +403,6 @@ /* * There is only one 'core' page-freeing function. */ -extern void FASTCALL(free_lru_page(struct page *)); extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); @@ -409,6 +412,8 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +extern int start_aggressive_readahead(unsigned int); + extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); @@ -469,7 +474,20 @@ return page_count(page) - !!page->buffers == 1; } -extern int remove_exclusive_swap_page(struct page *); +/* + * Work out if there are any other processes sharing this + * swap cache page. Never mind the buffers. 
+ */ +static inline int exclusive_swap_page(struct page *page) +{ + if (!PageLocked(page)) + BUG(); + if (!PageSwapCache(page)) + return 0; + if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + return 0; + return swap_count(page) == 1; /* 1: just cache */ +} extern void __free_pte(pte_t); diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Mon Oct 29 01:49:56 2001 +++ vm/include/linux/mmzone.h Tue Oct 30 04:32:51 2001 @@ -41,6 +41,7 @@ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; + int nr_active_pages, nr_inactive_pages; /* * free areas of different sizes diff -urN vm-ref/include/linux/pagemap.h vm/include/linux/pagemap.h --- vm-ref/include/linux/pagemap.h Tue Oct 30 03:37:11 2001 +++ vm/include/linux/pagemap.h Tue Oct 30 04:32:51 2001 @@ -29,7 +29,7 @@ #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(x) get_page(x) -#define page_cache_release(x) free_lru_page(x) +#define page_cache_release(x) __free_page(x) static inline struct page *page_cache_alloc(struct address_space *x) { diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/sched.h Tue Oct 30 04:32:51 2001 @@ -280,6 +280,14 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +struct zone_struct; + +struct local_pages { + struct list_head list; + unsigned int order, nr; + struct zone_struct * classzone; +}; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -318,8 +326,7 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; struct rw_sem_recursor mm_recursor; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + struct local_pages local_pages; /* task state */ struct linux_binfmt *binfmt; diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/swap.h Tue Oct 30 04:32:51 2001 @@ -112,6 +112,8 @@ /* linux/mm/vmscan.c */ extern wait_queue_head_t kswapd_wait; extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages_nozone(unsigned int)); +extern int vm_scan_ratio, vm_balance_ratio, vm_mapped_ratio; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -132,7 +134,6 @@ extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ -extern int out_of_memory(void); extern void oom_kill(void); /* linux/mm/swapfile.c */ @@ -176,34 +177,100 @@ BUG(); \ } while (0) +#define inc_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages++; \ + __classzone++; \ + } \ + nr_active_pages++; \ +} while (0) + +#define dec_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages--; \ + __classzone++; \ + } \ + nr_active_pages--; \ +} while (0) + +#define inc_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, 
* __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages++; \ + __classzone++; \ + } \ + nr_inactive_pages++; \ +} while (0) + +#define dec_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages--; \ + __classzone++; \ + } \ + nr_inactive_pages--; \ +} while (0) + #define add_page_to_active_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ + inc_nr_active_pages(page); \ } while (0) #define add_page_to_inactive_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ - SetPageInactive(page); \ + SetPageInactive(page); \ list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ + inc_nr_inactive_pages(page); \ } while (0) #define del_page_from_active_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageActive(page); \ - nr_active_pages--; \ + dec_nr_active_pages(page); \ + DEBUG_LRU_PAGE(page); \ } while (0) #define del_page_from_inactive_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageInactive(page); \ - nr_inactive_pages--; \ + dec_nr_inactive_pages(page); \ + DEBUG_LRU_PAGE(page); \ } while (0) /* diff -urN vm-ref/include/linux/sysctl.h vm/include/linux/sysctl.h --- vm-ref/include/linux/sysctl.h Tue Oct 30 04:32:40 2001 +++ vm/include/linux/sysctl.h Tue Oct 30 04:32:51 2001 @@ -134,12 +134,13 @@ VM_FREEPG=3, /* struct: Set free page thresholds */ VM_BDFLUSH=4, /* struct: Control buffer cache flushing */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ - VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ - VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_HEAP_STACK_GAP=11, /* int: page gap between heap and stack */ + VM_SCAN_RATIO=12, /* part of the inactive list to scan */ + VM_BALANCE_RATIO=13, /* balance active and inactive caches */ + VM_MAPPED_RATIO=14, /* pageout when we find too many mapped pages */ }; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Tue Oct 30 04:32:39 2001 +++ vm/kernel/fork.c Tue Oct 30 04:32:51 2001 @@ -645,7 +645,7 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); + INIT_LIST_HEAD(&p->local_pages.list); retval = -ENOMEM; /* copy all the process information */ diff -urN vm-ref/kernel/ksyms.c vm/kernel/ksyms.c --- vm-ref/kernel/ksyms.c Tue Oct 30 04:32:40 2001 +++ vm/kernel/ksyms.c Tue Oct 30 04:32:51 2001 @@ -89,6 +89,7 @@ EXPORT_SYMBOL(exit_sighand); /* internal kernel memory management */ +EXPORT_SYMBOL(start_aggressive_readahead); EXPORT_SYMBOL(_alloc_pages); EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Tue Oct 30 04:32:40 2001 +++ vm/kernel/sysctl.c Tue Oct 30 04:32:51 2001 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -259,6 +260,12 @@ }; static ctl_table vm_table[] = { + {VM_SCAN_RATIO, "vm_scan_ratio", + &vm_scan_ratio, sizeof(int), 0644, NULL, 
&proc_dointvec}, + {VM_BALANCE_RATIO, "vm_balance_ratio", + &vm_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAPPED_RATIO, "vm_mapped_ratio", + &vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Tue Oct 30 04:32:41 2001 +++ vm/mm/filemap.c Tue Oct 30 04:32:51 2001 @@ -1853,7 +1853,7 @@ * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. */ - mark_page_accessed(page); + activate_page(page); flush_page_to_ram(page); return page; @@ -2957,8 +2957,15 @@ } unlock: kunmap(page); + + /* + * Mark the page accessed if we wrote the + * beginning or we just did an lseek. + */ + if (!offset || !file->f_reada) + SetPageReferenced(page); + /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); page_cache_release(page); diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Tue Oct 30 04:32:40 2001 +++ vm/mm/memory.c Tue Oct 30 04:32:51 2001 @@ -941,7 +941,9 @@ if (TryLockPage(old_page)) break; /* Recheck swapcachedness once the page is locked */ - can_reuse = remove_exclusive_swap_page(old_page); + can_reuse = exclusive_swap_page(old_page); + if (can_reuse) + delete_from_swap_cache(old_page); UnlockPage(old_page); if (!can_reuse) break; @@ -975,7 +977,6 @@ if (PageReserved(old_page)) ++mm->rss; break_cow(vma, new_page, address, page_table); - lru_cache_add(new_page); /* Free the old page.. */ new_page = old_page; @@ -1141,8 +1142,12 @@ ret = 2; } - if (!Page_Uptodate(page)) - wait_on_page(page); + /* + * Freeze the "shared"ness of the page, ie page_count + swap_count. + * Must lock page before transferring our swap count to already + * obtained page count. + */ + lock_page(page); /* * Back out if somebody else faulted in this pte while we @@ -1150,6 +1155,7 @@ */ spin_lock(&mm->page_table_lock); if (!pte_same(*page_table, orig_pte)) { + UnlockPage(page); page_cache_release(page); spin_unlock(&mm->page_table_lock); return 1; @@ -1161,6 +1167,17 @@ swap_free(entry); + if (exclusive_swap_page(page)) { + if (write_access || vm_swap_full()) { + pte = pte_mkdirty(pte); + if (vma->vm_flags & VM_WRITE) + pte = pte_mkwrite(pte); + delete_from_swap_cache(page); + } + } + activate_page(page); + UnlockPage(page); + flush_page_to_ram(page); flush_icache_page(vma, page); set_pte(page_table, pte); @@ -1204,7 +1221,6 @@ mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - lru_cache_add(page); } set_pte(page_table, entry); @@ -1256,7 +1272,6 @@ return -1; copy_highpage(page, new_page); page_cache_release(new_page); - lru_cache_add(page); new_page = page; } diff -urN vm-ref/mm/mmap.c vm/mm/mmap.c --- vm-ref/mm/mmap.c Tue Oct 30 04:32:40 2001 +++ vm/mm/mmap.c Tue Oct 30 04:32:51 2001 @@ -74,6 +74,14 @@ free += nr_swap_pages; /* + * This double-counts: the nrpages are both in the page-cache + * and in the swapper space. At the same time, this compensates + * for the swap-space over-allocation (ie "nr_swap_pages" being + * too small. + */ + free += swapper_space.nrpages; + + /* * The code below doesn't account for free space in the inode * and dentry slab cache, slab cache fragmentation, inodes and * dentries which will become freeable under VM load, etc. 
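
For reference, the three tunables registered in the kernel/sysctl.c hunk above ("vm_scan_ratio", "vm_balance_ratio", "vm_mapped_ratio") are plain ints with proc_dointvec handlers and mode 0644, so they show up under /proc/sys/vm and can be read and written as decimal integers at runtime. A minimal userspace sketch follows; the file names come from the hunk, the defaults quoted in the comments are the ones set in mm/vmscan.c further down, and the standard /proc/sys mount is assumed.

/* Sketch only: dump the VM tunables added by this patch and
 * optionally set one, e.g. "vmtune vm_scan_ratio 16".
 * Assumes a kernel with this patch applied. */
#include <stdio.h>

static const char *knobs[] = {
	"vm_scan_ratio",	/* default 8 in mm/vmscan.c */
	"vm_balance_ratio",	/* default 6 */
	"vm_mapped_ratio",	/* default 5 */
};

int main(int argc, char **argv)
{
	char path[128];
	int i;

	for (i = 0; i < 3; i++) {
		FILE *f;
		int val;

		snprintf(path, sizeof(path), "/proc/sys/vm/%s", knobs[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* kernel without this patch */
		if (fscanf(f, "%d", &val) == 1)
			printf("%s = %d\n", knobs[i], val);
		fclose(f);
	}

	if (argc == 3) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/vm/%s", argv[1]);
		f = fopen(path, "w");
		if (!f || fprintf(f, "%s\n", argv[2]) < 0)
			return 1;
		fclose(f);
	}
	return 0;
}
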
diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Wed Oct 10 02:16:27 2001 +++ vm/mm/oom_kill.c Tue Oct 30 04:32:51 2001 @@ -192,67 +192,3 @@ schedule(); return; } - -static inline int node_zones_low(pg_data_t *pgdat) -{ - zone_t * zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - - if (zone->free_pages > (zone->pages_low)) - return 0; - - } - return 1; -} - -static int all_zones_low(void) -{ - pg_data_t * pgdat = pgdat_list; - - pgdat = pgdat_list; - do { - if (node_zones_low(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; -} - -/** - * out_of_memory - is the system out of memory? - * - * Returns 0 if there is still enough memory left, - * 1 when we are out of memory (otherwise). - */ -int out_of_memory(void) -{ - long cache_mem, limit; - - /* Enough free memory? Not OOM. */ - if (!all_zones_low()) - return 0; - - /* Enough swap space left? Not OOM. */ - if (nr_swap_pages > 0) - return 0; - - /* - * If the buffer and page cache (including swap cache) are over - * their (/proc tunable) minimum, we're still not OOM. We test - * this to make sure we don't return OOM when the system simply - * has a hard time with the cache. - */ - cache_mem = atomic_read(&page_cache_size); - limit = 2; - limit *= num_physpages / 100; - - if (cache_mem > limit) - return 0; - - /* Else... */ - return 1; -} diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Tue Oct 30 04:32:40 2001 +++ vm/mm/page_alloc.c Tue Oct 30 04:32:51 2001 @@ -144,12 +144,13 @@ * local since we must deal with fragmentation too and we * can't rely on the nr_local_pages information. */ - if (current->nr_local_pages && !current->allocation_order) + if ((current->local_pages.nr && !current->local_pages.order) || + !memclass(page->zone, current->local_pages.classzone)) goto back_local_freelist; - list_add(&page->list, ¤t->local_pages); + list_add(&page->list, ¤t->local_pages.list); page->index = order; - current->nr_local_pages++; + current->local_pages.nr++; } #define MARK_USED(index, order, area) \ @@ -231,35 +232,36 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) { struct page * page = NULL; - int __freed = 0; + int __freed; - if (!(gfp_mask & __GFP_WAIT)) - goto out; if (in_interrupt()) BUG(); - current->allocation_order = order; + current->local_pages.order = order; + current->local_pages.classzone = classzone; current->flags |= PF_MEMALLOC | PF_FREE_PAGES; __freed = try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - if (current->nr_local_pages) { + if (current->local_pages.nr) { struct list_head * entry, * local_pages; struct page * tmp; int nr_pages; - local_pages = ¤t->local_pages; + local_pages = ¤t->local_pages.list; if (likely(__freed)) { /* pick from the last inserted so we're lifo */ entry = local_pages->next; do { tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { + if (!memclass(tmp->zone, classzone)) + BUG(); + if (tmp->index == order) { list_del(entry); - current->nr_local_pages--; + current->local_pages.nr--; set_page_count(tmp, 1); page = tmp; @@ -285,7 +287,7 @@ } while ((entry = entry->next) != local_pages); } - nr_pages = current->nr_local_pages; + nr_pages = current->local_pages.nr; /* free in reverse order so that the global order will be lifo */ while ((entry = local_pages->prev) != local_pages) { list_del(entry); 
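
The local_pages rework in the page_alloc.c hunks above is easier to follow outside of the list_head macros. While a task allocates with PF_FREE_PAGES set, every page it frees is queued on a per-task list instead of going back to the buddy allocator; balance_classzone() then scans that list starting from the most recently freed page, keeps the first one whose order matches the failed allocation, returns the rest to the allocator in reverse order so the global free lists stay LIFO, and now BUG()s if a queued page falls outside the classzone (the memclass check moved into the free path). A toy model, in plain userspace C with all names invented for the sketch:

/* Toy model of the per-task local_pages behaviour, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct fake_page {
	int id;
	int order;			/* stored in page->index by the patch */
	struct fake_page *next;
};

static struct fake_page *local_list;	/* current->local_pages.list */

/* __free_pages_ok() with PF_FREE_PAGES set: queue at the head */
static void local_free(int id, int order)
{
	struct fake_page *p = malloc(sizeof(*p));

	p->id = id;
	p->order = order;
	p->next = local_list;
	local_list = p;
}

/* balance_classzone(): walk from the most recently freed page and
 * keep the first one of the requested order */
static struct fake_page *local_pick(int order)
{
	struct fake_page **pp, *p;

	for (pp = &local_list; (p = *pp) != NULL; pp = &p->next)
		if (p->order == order) {
			*pp = p->next;	/* list_del() + set_page_count(tmp, 1) */
			return p;
		}
	return NULL;
}

int main(void)
{
	struct fake_page *p;

	local_free(1, 0);
	local_free(2, 2);
	local_free(3, 0);		/* most recently freed order-0 page */

	p = local_pick(0);
	printf("picked page %d\n", p ? p->id : -1);	/* prints 3 */
	return 0;
}
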
@@ -294,9 +296,8 @@ if (!nr_pages--) BUG(); } - current->nr_local_pages = 0; + current->local_pages.nr = 0; } - out: *freed = __freed; return page; } @@ -354,8 +355,7 @@ /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & PF_MEMALLOC) { + if (current->flags & PF_MEMALLOC && !in_interrupt()) { zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); @@ -371,34 +371,52 @@ /* Atomic allocations - we can't balance anything */ if (!(gfp_mask & __GFP_WAIT)) - return NULL; + goto out; + rebalance: page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - if (zone_free_pages(z, order) > z->pages_min) { - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > z->pages_min) { + page = rmqueue(z, order); + if (page) + return page; + } } - } + goto rebalance; + } else { + /* + * Check that no other task is been killed meanwhile, + * in such a case we can succeed the allocation. + */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* Don't let big-order allocations loop */ - if (order > 1) - return NULL; + if (zone_free_pages(z, order) > z->pages_high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); +#ifdef CONFIG_DEBUG_GFP + show_stack(NULL); +#endif + return NULL; } /* @@ -427,15 +445,6 @@ return 0; } -void free_lru_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) { - if (PageActive(page) || PageInactive(page)) - lru_cache_del(page); - __free_pages_ok(page, 0); - } -} - void __free_pages(struct page *page, unsigned int order) { if (!PageReserved(page) && put_page_testzero(page)) @@ -521,17 +530,24 @@ { pg_data_t *pgdat = pgdat_list; unsigned int sum = 0; + zonelist_t *zonelist; + zone_t **zonep, *zone; do { - zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); - zone_t **zonep = zonelist->zones; - zone_t *zone; + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); + zonep = zonelist->zones; - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; + zone = *zonep; + if (zone) { + sum += zone->nr_inactive_pages; + do { + unsigned int free = zone->free_pages - zone->pages_high; + zonep++; + zone = *zonep; + if (free <= 0) + continue; + sum += free; + } while (zone); } pgdat = pgdat->node_next; @@ -554,6 +570,62 @@ } #endif +/* + * If it returns non zero it means there's lots of ram "free" + * (note: not in cache!) so any caller will know that + * he can allocate some memory to do some more aggressive + * (possibly wasteful) readahead. The state of the memory + * should be rechecked after every few pages allocated for + * doing this aggressive readahead. + * + * The gfp_mask parameter specifies in which kind of memory + * the readahead information will be applocated to. 
+ */ +int start_aggressive_readahead(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep, *zone; + int ret = 0; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + for (zone = *zonep++; zone; zone = *zonep++) + if (zone->free_pages > zone->pages_high * 2) + ret = 1; + + pgdat = pgdat->node_next; + } while (pgdat); + + return ret; +} + +int try_to_free_pages_nozone(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep; + int ret = 0; + unsigned long pf_free_pages; + + pf_free_pages = current->flags & PF_FREE_PAGES; + current->flags &= ~PF_FREE_PAGES; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + ret |= try_to_free_pages(*zonep, gfp_mask, 0); + + pgdat = pgdat->node_next; + } while (pgdat); + + current->flags |= pf_free_pages; + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* @@ -568,28 +640,31 @@ pg_data_t *tmpdat = pgdat; printk("Free pages: %6dkB (%6dkB HighMem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - nr_free_highpages() << (PAGE_SHIFT-10)); + K(nr_free_pages()), + K(nr_free_highpages())); while (tmpdat) { zone_t *zone; for (zone = tmpdat->node_zones; zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) - printk("Zone:%s freepages:%6lukB min:%6luKB low:%6lukB " - "high:%6lukB\n", - zone->name, - K(zone->free_pages), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high)); - + printk("Zone:%s freepages:%6lukB|%lu min:%6luKB|%lu low:%6lukB|%lu high:%6lukB:%lu active:%6dkB|%d inactive:%6dkB|%d\n", + zone->name, + K(zone->free_pages), + zone->free_pages, + K(zone->pages_min), + zone->pages_min, + K(zone->pages_low), + zone->pages_low, + K(zone->pages_high), + zone->pages_high, + K(zone->nr_active_pages), + zone->nr_active_pages, + K(zone->nr_inactive_pages), + zone->nr_inactive_pages); + tmpdat = tmpdat->node_next; } - printk("Free pages: %6dkB (%6dkB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - printk("( Active: %d, inactive: %d, free: %d )\n", nr_active_pages, nr_inactive_pages, @@ -764,6 +839,7 @@ zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; if (!size) continue; diff -urN vm-ref/mm/page_io.c vm/mm/page_io.c --- vm-ref/mm/page_io.c Tue Oct 30 03:37:11 2001 +++ vm/mm/page_io.c Tue Oct 30 04:32:51 2001 @@ -41,7 +41,6 @@ kdev_t dev = 0; int block_size; struct inode *swapf = 0; - int wait = 0; if (rw == READ) { ClearPageUptodate(page); @@ -73,18 +72,6 @@ /* block_size == PAGE_SIZE/zones_used */ brw_page(rw, page, dev, zones, block_size); - - /* Note! For consistency we do all of the logic, - * decrementing the page count, and unlocking the page in the - * swap lock map - in the IO completion handler. - */ - if (!wait) - return 1; - - wait_on_page(page); - /* This shouldn't happen, but check to be sure. */ - if (page_count(page) == 0) - printk(KERN_ERR "rw_swap_page: page unused while waiting!\n"); return 1; } diff -urN vm-ref/mm/shmem.c vm/mm/shmem.c --- vm-ref/mm/shmem.c Tue Oct 30 03:37:11 2001 +++ vm/mm/shmem.c Tue Oct 30 04:32:53 2001 @@ -212,9 +212,7 @@ entry = *ptr; *ptr = (swp_entry_t){0}; freed++; - - /* vmscan will do the actual page freeing later.. 
*/ - swap_free (entry); + free_swap_and_cache(entry); } return freed; } @@ -449,6 +447,7 @@ BUG(); /* Remove it from the page cache */ + lru_cache_del(page); remove_inode_page(page); page_cache_release(page); @@ -550,7 +549,7 @@ swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Tue Oct 30 03:37:11 2001 +++ vm/mm/swap.c Tue Oct 30 04:32:51 2001 @@ -49,6 +49,8 @@ if (PageActive(page)) { del_page_from_active_list(page); add_page_to_inactive_list(page); + /* deactivate yes, but refile at the first mark_page_accessed */ + SetPageReferenced(page); } } @@ -67,7 +69,9 @@ if (PageInactive(page)) { del_page_from_inactive_list(page); add_page_to_active_list(page); - } + ClearPageReferenced(page); + } else + SetPageReferenced(page); } void activate_page(struct page * page) @@ -83,11 +87,11 @@ */ void lru_cache_add(struct page * page) { - if (!PageActive(page) && !PageInactive(page)) { - spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); - spin_unlock(&pagemap_lru_lock); - } + if (!PageLocked(page)) + BUG(); + spin_lock(&pagemap_lru_lock); + add_page_to_inactive_list(page); + spin_unlock(&pagemap_lru_lock); } /** @@ -103,9 +107,9 @@ del_page_from_active_list(page); } else if (PageInactive(page)) { del_page_from_inactive_list(page); - } else { -// printk("VM: __lru_cache_del, found unknown page ?!\n"); - } + } else + printk("VM: __lru_cache_del, found unknown page ?!\n"); + DEBUG_LRU_PAGE(page); } /** @@ -114,6 +118,8 @@ */ void lru_cache_del(struct page * page) { + if (!PageLocked(page)) + BUG(); spin_lock(&pagemap_lru_lock); __lru_cache_del(page); spin_unlock(&pagemap_lru_lock); diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c --- vm-ref/mm/swap_state.c Tue Oct 30 03:37:11 2001 +++ vm/mm/swap_state.c Tue Oct 30 04:32:51 2001 @@ -17,8 +17,17 @@ #include +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. + */ static int swap_writepage(struct page *page) { + if (exclusive_swap_page(page)) { + delete_from_swap_cache(page); + UnlockPage(page); + return 0; + } rw_swap_page(WRITE, page); return 0; } @@ -109,7 +118,8 @@ if (!PageLocked(page)) BUG(); - block_flushpage(page, 0); + if (block_flushpage(page, 0)) + lru_cache_del(page); entry.val = page->index; @@ -137,7 +147,8 @@ * - Marcelo */ if (PageSwapCache(page) && !TryLockPage(page)) { - remove_exclusive_swap_page(page); + if (exclusive_swap_page(page)) + delete_from_swap_cache(page); UnlockPage(page); } page_cache_release(page); diff -urN vm-ref/mm/swapfile.c vm/mm/swapfile.c --- vm-ref/mm/swapfile.c Tue Oct 30 03:37:11 2001 +++ vm/mm/swapfile.c Tue Oct 30 04:32:51 2001 @@ -224,50 +224,6 @@ } /* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. 
- */ -int remove_exclusive_swap_page(struct page *page) -{ - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - return 0; - if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ - return 0; - - entry.val = page->index; - p = swap_info_get(entry); - if (!p) - return 0; - - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[SWP_OFFSET(entry)] == 1) { - /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) { - __delete_from_swap_cache(page); - retval = 1; - } - spin_unlock(&pagecache_lock); - } - swap_info_put(p); - - if (retval) { - block_flushpage(page, 0); - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Tue Oct 30 03:37:11 2001 +++ vm/mm/vmscan.c Tue Oct 30 04:32:51 2001 @@ -26,12 +26,28 @@ #include /* - * The "priority" of VM scanning is how much of the queues we - * will scan in one go. A value of 6 for DEF_PRIORITY implies - * that we'll scan 1/64th of the queues ("queue_length >> 6") - * during a normal aging round. + * "vm_scan_ratio" is how much of the queues we will scan + * in one go. A value of 8 for vm_scan_ratio implies that we'll + * scan 1/8 of the inactive list during a normal aging round. + * So if 1/vm_scan_ratio of the inactive cache is unfreeable + * we'll start the background paging. */ -#define DEF_PRIORITY (6) +int vm_scan_ratio = 8; + +/* + * "vm_scan_ratio" controls when we start to swapout, the lower, + * the earlier we'll start to swapout. + */ +int vm_mapped_ratio = 5; + +/* + * "vm_balance_ratio" controls the balance between active and + * inactive cache. The bigger vm_balance_ratio is, the easier the + * active cache will grow, because we'll rotate the active list + * slowly. A value of 6 means we'll go towards a balance of + * 1/7 of the cache being inactive. + */ +int vm_balance_ratio = 6; /* * The swap-out function returns 1 if it successfully @@ -50,7 +66,7 @@ /* Don't look at this pte if it's been accessed recently. 
*/ if (ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); + activate_page(page); return 0; } @@ -91,6 +107,11 @@ UnlockPage(page); { int freeable = page_count(page) - !!page->buffers <= 2; +#if 0 + if (freeable) + /* don't waste time waiting this page */ + deactivate_page(page); +#endif page_cache_release(page); return freeable; } @@ -287,13 +308,13 @@ return count; } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) +static int FASTCALL(swap_out(zone_t * classzone)); +static int swap_out(zone_t * classzone) { int counter, nr_pages = SWAP_CLUSTER_MAX; struct mm_struct *mm; - counter = mmlist_nr; + counter = mmlist_nr << 1; do { if (unlikely(current->need_resched)) { __set_current_state(TASK_RUNNING); @@ -329,15 +350,13 @@ return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)); +static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask) { struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = nr_pages*10; + int max_mapped = nr_pages * vm_mapped_ratio; - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { struct page * page; if (unlikely(current->need_resched)) { @@ -356,18 +375,13 @@ list_del(entry); list_add(entry, &inactive_list); - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; - if (!memclass(page->zone, classzone)) continue; + max_scan--; + /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) + if (!page->buffers && page_count(page) != 1) goto page_mapped; /* @@ -460,31 +474,35 @@ } } + if (unlikely(!page->mapping)) + BUG(); + spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * This is the non-racy check for busy page. + * It is critical to check PageDirty _after_ we made sure + * the page is freeable so not in use by anybody. + * At this point we're guaranteed that page->buffers is NULL, + * nobody can refill page->buffers under us because we still + * hold the page lock. */ - if (!page->mapping || !is_page_cache_freeable(page)) { + if (unlikely(page_count(page) > 1)) { spin_unlock(&pagecache_lock); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; + page_mapped: + if (--max_mapped < 0) { + spin_unlock(&pagemap_lru_lock); - /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! - */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } + if (!swap_out(classzone)) + return nr_pages; + max_mapped = nr_pages * vm_mapped_ratio; - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. 
- */ + spin_lock(&pagemap_lru_lock); + } + continue; + + } if (PageDirty(page)) { spin_unlock(&pagecache_lock); UnlockPage(page); @@ -515,20 +533,6 @@ } spin_unlock(&pagemap_lru_lock); - if (nr_pages <= 0) - return 0; - - /* - * If swapping out isn't appropriate, and - * we still fail, try the other (usually smaller) - * caches instead. - */ - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif - return nr_pages; } @@ -539,60 +543,99 @@ * We move them the other way when we see the * reference bit on the page. */ -static void refill_inactive(int nr_pages) +static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); +static void refill_inactive(int nr_pages, zone_t * classzone) { struct list_head * entry; - spin_lock(&pagemap_lru_lock); entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { + while (nr_pages && entry != &active_list) { struct page * page; page = list_entry(entry, struct page, lru); entry = entry->prev; + + if (!memclass(page->zone, classzone)) + continue; + if (PageTestandClearReferenced(page)) { list_del(&page->lru); list_add(&page->lru, &active_list); continue; } + nr_pages--; + del_page_from_active_list(page); add_page_to_inactive_list(page); + SetPageReferenced(page); + } + if (entry != &active_list) { + list_del(&active_list); + list_add(&active_list, entry); } - spin_unlock(&pagemap_lru_lock); } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * shrink_vfs)); +static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * shrink_vfs) { - int chunk_size = nr_pages; + int max_scan; unsigned long ratio; nr_pages -= kmem_cache_reap(gfp_mask); if (nr_pages <= 0) return 0; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); + spin_lock(&pagemap_lru_lock); + ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_balance_ratio) + 1); + if (ratio > nr_pages * 2) { + ratio = nr_pages * 2; + /* too much active cache so shrink the vfs as well */ + *shrink_vfs = 1; + } + refill_inactive(ratio, classzone); + + max_scan = classzone->nr_inactive_pages / vm_scan_ratio; + nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask); - return shrink_cache(nr_pages, classzone, gfp_mask, priority); + return nr_pages; } +static int check_classzone_need_balance(zone_t * classzone); + int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) { - int ret = 0; - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + for (;;) { + int tries = vm_scan_ratio << 2; + int nr_pages = SWAP_CLUSTER_MAX; - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); + do { + int shrink_vfs = 0; + nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &shrink_vfs); + if (shrink_vfs || nr_pages > 0) { + shrink_dcache_memory(vm_scan_ratio, gfp_mask); + shrink_icache_memory(vm_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_scan_ratio, gfp_mask); +#endif + 
} + if (nr_pages <= 0) + return 1; - return ret; + if (!swap_out(classzone)) + return 0; + } while (--tries); + + if (likely(current->pid != 1)) + break; + if (!check_classzone_need_balance(classzone)) + break; + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); + } + + return 0; } DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); @@ -624,7 +667,7 @@ if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(HZ*5); continue; } if (check_classzone_need_balance(zone)) @@ -647,9 +690,6 @@ do need_more_balance |= kswapd_balance_pgdat(pgdat); while ((pgdat = pgdat->node_next)); - if (need_more_balance && out_of_memory()) { - oom_kill(); - } } while (need_more_balance); }
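
The inc_nr_active_pages()/dec_nr_active_pages() macros added to include/linux/swap.h do classzone accounting rather than plain per-zone accounting: a page is counted in its own zone and in every higher zone of the same node, so zone->nr_active_pages reads as "active pages usable by allocations limited to this classzone", which is what shrink_cache() and shrink_caches() consult. A standalone model of the walk (three zone names hardcoded purely for illustration):

/* Model of the classzone-wide LRU counters kept by this patch. */
#include <stdio.h>

#define NR_ZONES 3
static const char *name[NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int nr_active[NR_ZONES];		/* zone->nr_active_pages */

/* inc_nr_active_pages(page): bump the page's own zone and every
 * higher zone of the node, i.e. every classzone the page falls in */
static void inc_nr_active_pages(int page_zone)
{
	int z;

	for (z = page_zone; z < NR_ZONES; z++)
		nr_active[z]++;
}

int main(void)
{
	int z;

	inc_nr_active_pages(0);		/* a DMA page */
	inc_nr_active_pages(1);		/* a lowmem (Normal) page */
	inc_nr_active_pages(2);		/* a HighMem page */

	for (z = 0; z < NR_ZONES; z++)
		printf("%-8s classzone: %d active pages\n", name[z], nr_active[z]);
	/* DMA: 1, Normal: 2, HighMem: 3 */
	return 0;
}
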
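
The sizing of refill_inactive() in shrink_caches() is the piece that implements the vm_balance_ratio comment: each pass deactivates roughly nr_pages * nr_active / (nr_inactive * vm_balance_ratio + 1) pages, clamped to twice the reclaim goal, and only when the clamp kicks in are the dcache/icache shrunk as well. A worked example with invented numbers; refill_target() is just a stand-in name for that arithmetic, and 32 is SWAP_CLUSTER_MAX on 2.4/i386.

/* Worked example of the refill_inactive() sizing in shrink_caches(). */
#include <stdio.h>

static int vm_balance_ratio = 6;

static unsigned long refill_target(int nr_pages, unsigned long nr_active,
				   unsigned long nr_inactive, int *shrink_vfs)
{
	unsigned long ratio;

	ratio = (unsigned long) nr_pages * nr_active /
		(nr_inactive * vm_balance_ratio + 1);
	*shrink_vfs = 0;
	if (ratio > (unsigned long) nr_pages * 2) {
		ratio = nr_pages * 2;	/* clamp and shrink the vfs caches too */
		*shrink_vfs = 1;
	}
	return ratio;
}

int main(void)
{
	int vfs;

	/* roughly balanced cache: 60k active vs 20k inactive -> ~15 pages */
	printf("%lu\n", refill_target(32, 60000, 20000, &vfs));

	/* active list far too big: clamped to 64, vfs caches shrunk too */
	printf("%lu vfs=%d\n", refill_target(32, 600000, 20000, &vfs), vfs);
	return 0;
}
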
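
start_aggressive_readahead() is only a hint, and its comment asks callers to re-poll it every few pages while they work. A possible caller might look like the sketch below; do_readahead_one() is a hypothetical helper, GFP_HIGHUSER is just an example mask, and only start_aggressive_readahead() itself comes from this patch.

/* Sketch of a possible start_aggressive_readahead() user: a readahead
 * loop that keeps going only while memory stays plentiful. */
static void maybe_readahead(unsigned long first, unsigned long last)
{
	unsigned long idx;

	if (!start_aggressive_readahead(GFP_HIGHUSER))
		return;			/* memory is tight, skip the extra I/O */

	for (idx = first; idx <= last; idx++) {
		do_readahead_one(idx);	/* hypothetical: read one page ahead */

		/* recheck every few pages, as the comment in page_alloc.c asks */
		if (!(idx & 7) && !start_aggressive_readahead(GFP_HIGHUSER))
			break;
	}
}
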