diff -urN vm-ref/arch/i386/config.in vm/arch/i386/config.in --- vm-ref/arch/i386/config.in Fri Oct 19 05:19:12 2001 +++ vm/arch/i386/config.in Fri Oct 19 05:19:29 2001 @@ -402,6 +402,7 @@ bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE + bool ' Debug allocation faliures' CONFIG_DEBUG_GFP fi endmenu diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Fri Oct 19 05:19:13 2001 +++ vm/fs/buffer.c Fri Oct 19 05:19:29 2001 @@ -115,7 +115,7 @@ int dummy5; /* unused */ } b_un; unsigned int data[N_PARAM]; -} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; +} bdf_prm = {{40, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; @@ -2396,11 +2396,8 @@ ll_rw_block(WRITE, 1, &p); tryagain = 0; } else if (buffer_locked(p)) { - if (gfp_mask & __GFP_WAIT) { - wait_on_buffer(p); - tryagain = 1; - } else - tryagain = 0; + wait_on_buffer(p); + tryagain = 1; } } else tryagain = 0; diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Mon Oct 8 04:28:58 2001 +++ vm/include/linux/mmzone.h Fri Oct 19 05:19:29 2001 @@ -41,6 +41,7 @@ unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; int need_balance; + int nr_active_pages, nr_inactive_pages; /* * free areas of different sizes diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Fri Oct 19 05:19:13 2001 +++ vm/include/linux/sched.h Fri Oct 19 05:19:29 2001 @@ -280,6 +280,14 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +struct zone_struct; + +struct local_pages { + struct list_head list; + unsigned int order, nr; + struct zone_struct * classzone; +}; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -318,8 +326,7 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; struct rw_sem_recursor mm_recursor; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + struct local_pages local_pages; /* task state */ struct linux_binfmt *binfmt; diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Fri Oct 19 05:19:13 2001 +++ vm/include/linux/swap.h Fri Oct 19 05:19:29 2001 @@ -105,15 +105,14 @@ extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(deactivate_page(struct page *)); -extern void FASTCALL(deactivate_page_nolock(struct page *)); extern void FASTCALL(activate_page(struct page *)); -extern void FASTCALL(activate_page_nolock(struct page *)); extern void swap_setup(void); /* linux/mm/vmscan.c */ extern wait_queue_head_t kswapd_wait; -extern int FASTCALL(try_to_free_pages(unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int vm_scan_ratio, vm_mapped_ratio, vm_balance_ratio; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -134,7 +133,6 @@ extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ -extern int out_of_memory(void); extern void oom_kill(void); /* linux/mm/swapfile.c */ @@ -177,27 +175,91 @@ BUG(); \ } while (0) +#define inc_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow 
= __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages++; \ + __classzone++; \ + } \ + nr_active_pages++; \ +} while (0) + +#define dec_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages--; \ + __classzone++; \ + } \ + nr_active_pages--; \ +} while (0) + +#define inc_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages++; \ + __classzone++; \ + } \ + nr_inactive_pages++; \ +} while (0) + +#define dec_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages--; \ + __classzone++; \ + } \ + nr_inactive_pages--; \ +} while (0) + #define add_page_to_active_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ + inc_nr_active_pages(page); \ } while (0) #define add_page_to_inactive_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ - SetPageInactive(page); \ + SetPageInactive(page); \ list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ + inc_nr_inactive_pages(page); \ } while (0) #define del_page_from_active_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageActive(page); \ - nr_active_pages--; \ + dec_nr_active_pages(page); \ DEBUG_LRU_PAGE(page); \ } while (0) @@ -205,7 +267,7 @@ do { \ list_del(&(page)->lru); \ ClearPageInactive(page); \ - nr_inactive_pages--; \ + dec_nr_inactive_pages(page); \ DEBUG_LRU_PAGE(page); \ } while (0) diff -urN vm-ref/include/linux/sysctl.h vm/include/linux/sysctl.h --- vm-ref/include/linux/sysctl.h Fri Oct 19 05:19:13 2001 +++ vm/include/linux/sysctl.h Fri Oct 19 05:19:29 2001 @@ -134,12 +134,13 @@ VM_FREEPG=3, /* struct: Set free page thresholds */ VM_BDFLUSH=4, /* struct: Control buffer cache flushing */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ - VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ - VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_HEAP_STACK_GAP=11, /* int: page gap between heap and stack */ + VM_SCAN_RATIO=12, /* part of the inactive list to scan */ + VM_MAPPED_RATIO=13, /* detect when it's time to start paging */ + VM_BALANCE_RATIO=14, /* balance active and inactive caches */ }; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Fri Oct 19 05:19:10 2001 +++ vm/kernel/fork.c Fri Oct 19 05:19:29 2001 @@ -645,7 +645,7 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); + INIT_LIST_HEAD(&p->local_pages.list); retval = -ENOMEM; /* copy all the process information */ diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Fri Oct 
19 05:19:13 2001 +++ vm/kernel/sysctl.c Fri Oct 19 05:19:29 2001 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -259,6 +260,12 @@ }; static ctl_table vm_table[] = { + {VM_SCAN_RATIO, "vm_scan_ratio", + &vm_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAPPED_RATIO, "vm_mapped_ratio", + &vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_BALANCE_RATIO, "vm_balance_ratio", + &vm_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Fri Oct 19 05:19:13 2001 +++ vm/mm/filemap.c Fri Oct 19 05:19:29 2001 @@ -2919,8 +2919,15 @@ } unlock: kunmap(page); + + /* + * Mark the page accessed if we wrote the + * beginning or we just did an lseek. + */ + if (!offset || !file->f_reada) + mark_page_accessed(page); + /* Mark it unlocked again and drop the page.. */ - SetPageReferenced(page); UnlockPage(page); page_cache_release(page); diff -urN vm-ref/mm/highmem.c vm/mm/highmem.c --- vm-ref/mm/highmem.c Fri Oct 19 05:19:13 2001 +++ vm/mm/highmem.c Fri Oct 19 05:19:29 2001 @@ -328,7 +328,6 @@ struct list_head *tmp; struct page *page; -repeat_alloc: page = alloc_page(GFP_NOHIGHIO); if (page) return page; @@ -338,6 +337,7 @@ */ wakeup_bdflush(); +repeat_alloc: /* * Try to allocate from the emergency pool. */ @@ -366,7 +366,6 @@ struct list_head *tmp; struct buffer_head *bh; -repeat_alloc: bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO); if (bh) return bh; @@ -376,6 +375,7 @@ */ wakeup_bdflush(); +repeat_alloc: /* * Try to allocate from the emergency pool. */ diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Fri Oct 19 05:19:13 2001 +++ vm/mm/memory.c Fri Oct 19 05:19:29 2001 @@ -1105,10 +1105,6 @@ return; } -/* Swap 80% full? Release the pages as they are paged in.. */ -#define vm_swap_full() \ - (swapper_space.nrpages*5 > total_swap_pages*4) - /* * We hold the mm semaphore and the page_table_lock on entry and exit. */ @@ -1164,12 +1160,10 @@ swap_free(entry); mark_page_accessed(page); if (exclusive_swap_page(page)) { - if (write_access || vm_swap_full()) { - pte = pte_mkdirty(pte); - if (vma->vm_flags & VM_WRITE) - pte = pte_mkwrite(pte); - delete_from_swap_cache(page); - } + if (vma->vm_flags & VM_WRITE) + pte = pte_mkwrite(pte); + pte = pte_mkdirty(pte); + delete_from_swap_cache(page); } UnlockPage(page); diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Wed Oct 10 02:16:27 2001 +++ vm/mm/oom_kill.c Fri Oct 19 05:19:29 2001 @@ -192,67 +192,3 @@ schedule(); return; } - -static inline int node_zones_low(pg_data_t *pgdat) -{ - zone_t * zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - - if (zone->free_pages > (zone->pages_low)) - return 0; - - } - return 1; -} - -static int all_zones_low(void) -{ - pg_data_t * pgdat = pgdat_list; - - pgdat = pgdat_list; - do { - if (node_zones_low(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->node_next)); - - return 1; -} - -/** - * out_of_memory - is the system out of memory? - * - * Returns 0 if there is still enough memory left, - * 1 when we are out of memory (otherwise). - */ -int out_of_memory(void) -{ - long cache_mem, limit; - - /* Enough free memory? Not OOM. */ - if (!all_zones_low()) - return 0; - - /* Enough swap space left? Not OOM. 
*/ - if (nr_swap_pages > 0) - return 0; - - /* - * If the buffer and page cache (including swap cache) are over - * their (/proc tunable) minimum, we're still not OOM. We test - * this to make sure we don't return OOM when the system simply - * has a hard time with the cache. - */ - cache_mem = atomic_read(&page_cache_size); - limit = 2; - limit *= num_physpages / 100; - - if (cache_mem > limit) - return 0; - - /* Else... */ - return 1; -} diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Fri Oct 19 05:19:13 2001 +++ vm/mm/page_alloc.c Fri Oct 19 05:19:31 2001 @@ -146,12 +146,13 @@ * local since we must deal with fragmentation too and we * can't rely on the nr_local_pages information. */ - if (current->nr_local_pages && !current->allocation_order) + if ((current->local_pages.nr && !current->local_pages.order) || + !memclass(page->zone, current->local_pages.classzone)) goto back_local_freelist; - list_add(&page->list, ¤t->local_pages); + list_add(&page->list, ¤t->local_pages.list); page->index = order; - current->nr_local_pages++; + current->local_pages.nr++; } #define MARK_USED(index, order, area) \ @@ -233,35 +234,36 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) { struct page * page = NULL; - int __freed = 0; + int __freed; - if (!(gfp_mask & __GFP_WAIT)) - goto out; if (in_interrupt()) BUG(); - current->allocation_order = order; + current->local_pages.order = order; + current->local_pages.classzone = classzone; current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - __freed = try_to_free_pages(gfp_mask, order); + __freed = try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - if (current->nr_local_pages) { + if (current->local_pages.nr) { struct list_head * entry, * local_pages; struct page * tmp; int nr_pages; - local_pages = ¤t->local_pages; + local_pages = ¤t->local_pages.list; if (likely(__freed)) { /* pick from the last inserted so we're lifo */ entry = local_pages->next; do { tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { + if (!memclass(tmp->zone, classzone)) + BUG(); + if (tmp->index == order) { list_del(entry); - current->nr_local_pages--; + current->local_pages.nr--; set_page_count(tmp, 1); page = tmp; @@ -289,7 +291,7 @@ } while ((entry = entry->next) != local_pages); } - nr_pages = current->nr_local_pages; + nr_pages = current->local_pages.nr; /* free in reverse order so that the global order will be lifo */ while ((entry = local_pages->prev) != local_pages) { list_del(entry); @@ -298,9 +300,8 @@ if (!nr_pages--) BUG(); } - current->nr_local_pages = 0; + current->local_pages.nr = 0; } - out: *freed = __freed; return page; } @@ -358,8 +359,7 @@ /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & PF_MEMALLOC) { + if (current->flags & PF_MEMALLOC && !in_interrupt()) { zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); @@ -375,34 +375,52 @@ /* Atomic allocations - we can't balance anything */ if (!(gfp_mask & __GFP_WAIT)) - return NULL; + goto out; + rebalance: page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - if (zone_free_pages(z, order) > z->pages_min) { - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > 
z->pages_min) { + page = rmqueue(z, order); + if (page) + return page; + } } - } + goto rebalance; + } else { + /* + * Check that no other task is been killed meanwhile, + * in such a case we can succeed the allocation. + */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* Don't let big-order allocations loop */ - if (order > 1) - return NULL; + if (zone_free_pages(z, order) > z->pages_high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); +#ifdef CONFIG_DEBUG_GFP + show_stack(NULL); +#endif + return NULL; } /* @@ -523,13 +541,20 @@ zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); zonep = zonelist->zones; - for (zone = *zonep++; zone; zone = *zonep++) - sum += zone->free_pages; + zone = *zonep; + if (zone) { + sum += zone->nr_inactive_pages + zone->nr_active_pages; + do { + sum += zone->free_pages; + zonep++; + zone = *zonep; + } while (zone); + } pgdat = pgdat->node_next; } while (pgdat); - return sum + nr_active_pages + nr_inactive_pages; + return sum; } #if CONFIG_HIGHMEM @@ -597,25 +622,24 @@ zone_t *zone; for (zone = tmpdat->node_zones; zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) - printk("Zone:%s freepages:%6lukB min:%6luKB low:%6lukB " - "high:%6lukB\n", - zone->name, - (zone->free_pages) - << ((PAGE_SHIFT-10)), - zone->pages_min - << ((PAGE_SHIFT-10)), - zone->pages_low - << ((PAGE_SHIFT-10)), - zone->pages_high - << ((PAGE_SHIFT-10))); - + printk("Zone:%s freepages:%6lukB|%lu min:%6luKB|%lu low:%6lukB|%lu high:%6lukB:%lu active:%6dkB|%d inactive:%6dkB|%d\n", + zone->name, + zone->free_pages << (PAGE_SHIFT-10), + zone->free_pages, + zone->pages_min << (PAGE_SHIFT-10), + zone->pages_min, + zone->pages_low << (PAGE_SHIFT-10), + zone->pages_low, + zone->pages_high << (PAGE_SHIFT-10), + zone->pages_high, + zone->nr_active_pages << (PAGE_SHIFT-10), + zone->nr_active_pages, + zone->nr_inactive_pages << (PAGE_SHIFT-10), + zone->nr_inactive_pages); + tmpdat = tmpdat->node_next; } - printk("Free pages: %6dkB (%6dkB HighMem)\n", - nr_free_pages() << (PAGE_SHIFT-10), - nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Active: %d, inactive: %d, free: %d )\n", nr_active_pages, nr_inactive_pages, @@ -790,6 +814,7 @@ zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; if (!size) continue; diff -urN vm-ref/mm/shmem.c vm/mm/shmem.c --- vm-ref/mm/shmem.c Fri Oct 19 05:00:08 2001 +++ vm/mm/shmem.c Fri Oct 19 05:19:29 2001 @@ -557,7 +557,7 @@ swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Fri Oct 19 05:00:08 2001 +++ vm/mm/swap.c Fri Oct 19 05:19:29 2001 @@ -48,7 +48,7 @@ * called on a page which is not on any of the lists, the * page is left alone. 
*/ -void deactivate_page_nolock(struct page * page) +static inline void deactivate_page_nolock(struct page * page) { if (PageActive(page)) { del_page_from_active_list(page); @@ -66,7 +66,7 @@ /* * Move an inactive page to the active list. */ -void activate_page_nolock(struct page * page) +static inline void activate_page_nolock(struct page * page) { if (PageInactive(page)) { del_page_from_inactive_list(page); diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Fri Oct 19 05:00:08 2001 +++ vm/mm/vmscan.c Fri Oct 19 05:19:29 2001 @@ -26,14 +26,29 @@ #include /* - * The "priority" of VM scanning is how much of the queues we - * will scan in one go. A value of 6 for DEF_PRIORITY implies - * that we'll scan 1/64th of the queues ("queue_length >> 6") - * during a normal aging round. + * The "vm_scan_ratio" is how much of the queues we will scan + * in one go. A value of 6 for vm_scan_ratio implies that we'll + * scan 1/6 of the inactive list during a normal aging round. */ -#define DEF_PRIORITY (6) +int vm_scan_ratio = 16; -#define page_zone_plenty(page) ((page)->zone->free_pages > (page)->zone->pages_high) +/* + * The "vm_mapped_ratio" controls when to start early-paging, we probe + * the inactive list during shrink_cache() and if there are too many + * mapped unfreeable pages we have an indication that we'd better + * start paging. The bigger vm_mapped_ratio is, the eaerlier the + * machine will run into swapping activities. + */ +int vm_mapped_ratio = 8; + +/* + * The "vm_balance_ratio" controls the balance between active and + * inactive cache. The bigger vm_balance_ratio is, the easier the + * active cache will grow, because we'll rotate the active list + * slowly. A value of 4 means we'll go towards a balance of + * 1/5 of the cache being inactive. + */ +int vm_balance_ratio = 32; /* * The swap-out function returns 1 if it successfully @@ -45,7 +60,7 @@ */ /* mm->page_table_lock is held. mmap_sem is not held */ -static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page) +static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) { pte_t pte; swp_entry_t entry; @@ -53,11 +68,14 @@ /* Don't look at this pte if it's been accessed recently. */ if (ptep_test_and_clear_young(page_table)) { flush_tlb_page(vma, address); + mark_page_accessed(page); return 0; } - /* Don't bother replenishing zones that have tons of memory */ - if (page_zone_plenty(page)) + if (PageActive(page)) + return 0; + + if (!memclass(page->zone, classzone)) return 0; if (TryLockPage(page)) @@ -146,7 +164,7 @@ } /* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count) +static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) { pte_t * pte; unsigned long pmd_end; @@ -170,7 +188,7 @@ struct page *page = pte_page(*pte); if (VALID_PAGE(page) && !PageReserved(page)) { - count -= try_to_swap_out(mm, vma, address, pte, page); + count -= try_to_swap_out(mm, vma, address, pte, page, classzone); if (!count) { address += PAGE_SIZE; break; @@ -185,7 +203,7 @@ } /* mm->page_table_lock is held. 
mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count) +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) { pmd_t * pmd; unsigned long pgd_end; @@ -205,7 +223,7 @@ end = pgd_end; do { - count = swap_out_pmd(mm, vma, pmd, address, end, count); + count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; address = (address + PMD_SIZE) & PMD_MASK; @@ -215,7 +233,7 @@ } /* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count) +static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) { pgd_t *pgdir; unsigned long end; @@ -230,7 +248,7 @@ if (address >= end) BUG(); do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count); + count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; address = (address + PGDIR_SIZE) & PGDIR_MASK; @@ -245,7 +263,7 @@ /* * Returns remaining count of pages to be swapped out by followup call. */ -static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter) +static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone) { unsigned long address; struct vm_area_struct* vma; @@ -267,7 +285,7 @@ address = vma->vm_start; for (;;) { - count = swap_out_vma(mm, vma, address, count); + count = swap_out_vma(mm, vma, address, count, classzone); vma = vma->vm_next; if (!vma) break; @@ -284,14 +302,14 @@ return count; } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, int nr_pages)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, int nr_pages) +static int FASTCALL(swap_out(zone_t * classzone, unsigned int gfp_mask)); +static int swap_out(zone_t * classzone, unsigned int gfp_mask) { - int counter; + int counter, nr_pages = SWAP_CLUSTER_MAX; struct mm_struct *mm; /* Then, look at the other mm's */ - counter = mmlist_nr / priority; + counter = mmlist_nr; do { if (unlikely(current->need_resched)) { __set_current_state(TASK_RUNNING); @@ -312,7 +330,7 @@ atomic_inc(&mm->mm_users); spin_unlock(&mmlist_lock); - nr_pages = swap_out_mm(mm, nr_pages, &counter); + nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone); mmput(mm); @@ -327,13 +345,13 @@ return 0; } -static int FASTCALL(shrink_cache(int nr_pages, int max_scan, unsigned int gfp_mask)); -static int shrink_cache(int nr_pages, int max_scan, unsigned int gfp_mask) +static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask, int * mapped)); +static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask, int * mapped) { struct list_head * entry; + int __mapped = 0; - spin_lock(&pagemap_lru_lock); - while (max_scan && (entry = inactive_list.prev) != &inactive_list) { + while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { struct page * page; if (unlikely(current->need_resched)) { @@ -351,16 +369,17 @@ list_del(entry); list_add(entry, &inactive_list); - if (PageTestandClearReferenced(page)) + + if (!memclass(page->zone, classzone)) continue; max_scan--; - if (unlikely(page_zone_plenty(page))) - continue; /* Racy check to 
avoid trylocking when not worthwhile */ - if (!page->buffers && page_count(page) != 1) + if (!page->buffers && page_count(page) != 1) { + __mapped++; continue; + } /* * The page is locked. IO in progress? @@ -457,13 +476,20 @@ /* * this is the non-racy check, it is critical to check * PageDirty _after_ we made sure the page is freeable - * so not in use by anybody. + * so not in use by anybody. At this point we're + * guaranteed that page->buffers is NULL, nobody + * can refill page->buffers under us because we still + * hold the page lock. */ - if (!is_page_cache_freeable(page) || PageDirty(page)) { + if (page_count(page) > 1) { + __mapped++; + page_was_dirty: spin_unlock(&pagecache_lock); UnlockPage(page); continue; } + if (unlikely(PageDirty(page))) + goto page_was_dirty; /* point of no return */ if (likely(!PageSwapCache(page))) { @@ -489,6 +515,7 @@ } spin_unlock(&pagemap_lru_lock); + *mapped = __mapped; return nr_pages; } @@ -499,74 +526,93 @@ * We move them the other way when we see the * reference bit on the page. */ -static void refill_inactive(int nr_pages) +static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); +static void refill_inactive(int nr_pages, zone_t * classzone) { struct list_head * entry; - spin_lock(&pagemap_lru_lock); entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { + while (nr_pages && entry != &active_list) { struct page * page; page = list_entry(entry, struct page, lru); entry = entry->prev; + + if (!memclass(page->zone, classzone)) + continue; + if (PageTestandClearReferenced(page)) { list_del(&page->lru); list_add(&page->lru, &active_list); continue; } + nr_pages--; + del_page_from_active_list(page); add_page_to_inactive_list(page); + SetPageReferenced(page); + } + if (entry != &active_list) { + list_del(&active_list); + list_add(&active_list, entry); } - spin_unlock(&pagemap_lru_lock); } -static int FASTCALL(shrink_caches(int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(int priority, unsigned int gfp_mask, int nr_pages) +static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * force_paging)); +static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * force_paging) { - int max_scan; - int chunk_size = nr_pages; + int max_scan, orig_nr_pages = nr_pages, mapped; unsigned long ratio; nr_pages -= kmem_cache_reap(gfp_mask); if (nr_pages <= 0) return 0; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); - - max_scan = nr_inactive_pages / priority; - nr_pages = shrink_cache(nr_pages, max_scan, gfp_mask); - if (nr_pages <= 0) - return 0; + spin_lock(&pagemap_lru_lock); + ratio = (unsigned long) orig_nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_balance_ratio) + 1); + refill_inactive(ratio, classzone); - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + max_scan = classzone->nr_inactive_pages / vm_scan_ratio; + nr_pages = shrink_cache(orig_nr_pages, max_scan, classzone, gfp_mask, &mapped); + *force_paging = 0; + if ((unsigned long) mapped * vm_mapped_ratio > max_scan) + *force_paging = 1; return nr_pages; } -int try_to_free_pages(unsigned int gfp_mask, unsigned int order) +static int 
FASTCALL(check_classzone_need_balance(zone_t * classzone)); + +int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order) { int ret = 0; - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; - do { - nr_pages = shrink_caches(priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; + for (;;) { + int tries = vm_scan_ratio << 2, force_paging; + int nr_pages = SWAP_CLUSTER_MAX; - ret |= swap_out(priority, gfp_mask, SWAP_CLUSTER_MAX << 2); - } while (--priority); + do { + nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &force_paging); + if (force_paging || nr_pages > 0) + ret |= swap_out(classzone, gfp_mask); + if (nr_pages <= 0) + return 1; + + shrink_dcache_memory(vm_scan_ratio, gfp_mask); + shrink_icache_memory(1, gfp_mask); + } while (--tries); + + if (likely(current->pid != 1)) + break; + if (!check_classzone_need_balance(classzone)) + break; + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); + } - return ret; + return 0; } DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); @@ -595,10 +641,10 @@ schedule(); if (!zone->need_balance) continue; - if (!try_to_free_pages(GFP_KSWAPD, 0)) { + if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(HZ*5); continue; } if (check_classzone_need_balance(zone)) @@ -621,9 +667,6 @@ do need_more_balance |= kswapd_balance_pgdat(pgdat); while ((pgdat = pgdat->node_next)); - if (need_more_balance && out_of_memory()) { - oom_kill(); - } } while (need_more_balance); }
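
A few notes on the subtler parts of the patch follow; the C snippets are stand-alone sketches written for illustration under stated assumptions, not code taken from (or meant to apply to) the tree.

First, the per-classzone LRU accounting added in include/linux/swap.h: the new inc_nr_active_pages()/dec_nr_active_pages() macros do not keep a strictly per-zone count. A page charges its own zone and every higher zone of the same node, so zone->nr_active_pages effectively reads as "active pages that an allocation limited to this classzone could hope to reclaim", which is what shrink_caches() and balance_classzone() want to look at. A minimal sketch of the same walk, with simplified types:

        /* Stand-alone sketch of the classzone-wide counters; simplified types. */
        #include <stdio.h>

        #define NR_ZONES 3              /* e.g. ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM */

        struct zone { int nr_active_pages; };
        struct node { struct zone zones[NR_ZONES]; };

        /* Mirror of inc_nr_active_pages(): charge the page's own zone and
         * every higher zone of the node, i.e. every classzone it belongs to. */
        static void inc_nr_active(struct node *nd, int page_zone)
        {
                struct zone *z = nd->zones + page_zone;
                struct zone *overflow = nd->zones + NR_ZONES;

                while (z < overflow)
                        (z++)->nr_active_pages++;
        }

        int main(void)
        {
                struct node nd = {{{0}}};
                int i;

                inc_nr_active(&nd, 0);          /* one ZONE_DMA page    */
                inc_nr_active(&nd, 1);          /* one ZONE_NORMAL page */

                for (i = 0; i < NR_ZONES; i++)  /* prints: 1 2 2        */
                        printf("%d ", nd.zones[i].nr_active_pages);
                printf("\n");
                return 0;
        }

This is also why nr_free_buffer_pages() can now add the first (widest allowed) zone's nr_active_pages + nr_inactive_pages instead of the global counters: that single zone already accounts for everything below it.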
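
Second, the per-task local free list: the three loose task_struct fields (local_pages, allocation_order, nr_local_pages) are regrouped into one struct local_pages that also records the classzone being balanced, and fork() initialises local_pages.list while order and classzone are (re)set by balance_classzone() right before try_to_free_pages() runs. The sketch below shows the shape of the data and mirrors the test added to __free_pages_ok(), with memclass() collapsed to a plain flag for illustration:

        /* Sketch of the regrouped per-task local free list; kernel types elided. */
        struct list_head { struct list_head *next, *prev; };
        struct zone_struct;                     /* opaque here */

        struct local_pages {
                struct list_head list;          /* pages freed while PF_FREE_PAGES is set */
                unsigned int order;             /* order the blocked allocation asked for */
                unsigned int nr;                /* pages currently parked on the list     */
                struct zone_struct *classzone;  /* classzone the allocation is balancing  */
        };

        /* Mirror of the __free_pages_ok() test in the patch: park the page
         * locally only if it can still help the allocation being balanced. */
        static int keep_page_local(const struct local_pages *lp, int page_in_classzone)
        {
                if ((lp->nr && !lp->order) || !page_in_classzone)
                        return 0;               /* back to the buddy free lists */
                return 1;
        }

Because the free path now filters by classzone, balance_classzone() can turn the old silent skip into a BUG() when it finds a parked page outside the classzone.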
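
Third, one subtle change in the reworked refill_inactive(): the list_del(&active_list)/list_add(&active_list, entry) pair at the end moves the list head itself rather than any page. It re-inserts the active_list head at the point where the scan stopped, so the next call, which again starts from active_list.prev, resumes the round-robin there instead of re-aging the same tail pages. A toy demonstration with a kernel-style circular list (stand-alone; struct page simplified so that lru is its first member and a plain cast stands in for list_entry()):

        /* Toy demo of the active-list head rotation at the end of refill_inactive(). */
        #include <stdio.h>

        struct list_head { struct list_head *next, *prev; };

        static void list_init(struct list_head *h) { h->next = h->prev = h; }

        static void list_del(struct list_head *e)
        {
                e->prev->next = e->next;
                e->next->prev = e->prev;
        }

        /* Insert new right after head (kernel list_add() semantics). */
        static void list_add(struct list_head *new, struct list_head *head)
        {
                new->prev = head;
                new->next = head->next;
                head->next->prev = new;
                head->next = new;
        }

        struct page { struct list_head lru; int id; };

        int main(void)
        {
                struct list_head active;
                struct page p[5];
                struct list_head *entry;
                int scanned;

                list_init(&active);
                for (int i = 4; i >= 0; i--) {  /* list order ends up 0,1,2,3,4 */
                        p[i].id = i;
                        list_add(&p[i].lru, &active);
                }

                /* Scan two pages from the tail (4, then 3).  In the real code they
                 * would be re-queued or moved to the inactive list; here we only
                 * track the cursor the way refill_inactive() does. */
                entry = active.prev;
                for (scanned = 0; scanned < 2; scanned++)
                        entry = entry->prev;

                /* Rotate the head so the next pass resumes where this one stopped. */
                if (entry != &active) {
                        list_del(&active);
                        list_add(&active, entry);
                }

                /* Prints: next tail: page 2 */
                printf("next tail: page %d\n", ((struct page *) active.prev)->id);
                return 0;
        }

Note also that refill_inactive() and shrink_cache() now run under a single pagemap_lru_lock acquisition taken in shrink_caches(), instead of each taking the lock on its own.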
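
Finally, the three tunables exported through sysctl (/proc/sys/vm/vm_scan_ratio, vm_mapped_ratio and vm_balance_ratio; their defaults sit at the top of mm/vmscan.c) all act inside shrink_caches(): each pass may scan nr_inactive/vm_scan_ratio pages of the inactive list, the active list is rotated in proportion to nr_active/(nr_inactive*vm_balance_ratio), and swap_out() is forced early when the scan hits too many mapped pages. A worked example of that arithmetic, as a stand-alone C program with made-up list sizes; it mirrors the expressions in the patch rather than calling into the kernel:

        /* Worked example of the shrink_caches() arithmetic with made-up numbers. */
        #include <stdio.h>

        int vm_scan_ratio = 16, vm_mapped_ratio = 8, vm_balance_ratio = 32; /* example values */

        int main(void)
        {
                unsigned long nr_active = 60000, nr_inactive = 20000; /* classzone LRU sizes */
                int nr_pages = 32;      /* SWAP_CLUSTER_MAX */
                int mapped = 400;       /* busy/mapped pages seen by shrink_cache() */

                /* Slice of the inactive list one pass is allowed to scan. */
                int max_scan = nr_inactive / vm_scan_ratio;

                /* How far to rotate the active list towards the inactive one. */
                unsigned long refill = (unsigned long) nr_pages * nr_active /
                                       (nr_inactive * vm_balance_ratio + 1);

                /* Too many mapped pages per scan -> ask the caller to swap_out() early. */
                int force_paging = (unsigned long) mapped * vm_mapped_ratio > max_scan;

                /* With these numbers: max_scan=1250 refill=2 force_paging=1 */
                printf("max_scan=%d refill=%lu force_paging=%d\n",
                       max_scan, refill, force_paging);
                return 0;
        }

In practice: raising vm_scan_ratio makes each pass touch a smaller slice of the inactive list, raising vm_balance_ratio lets the active cache grow larger before it is rotated, and raising vm_mapped_ratio makes paging kick in earlier, since fewer mapped pages per scan are enough to trigger swap_out().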