diff -urN vm-ref/arch/sparc/kernel/sys_sunos.c vm/arch/sparc/kernel/sys_sunos.c --- vm-ref/arch/sparc/kernel/sys_sunos.c Tue Jan 22 18:52:53 2002 +++ vm/arch/sparc/kernel/sys_sunos.c Tue Feb 26 16:44:30 2002 @@ -193,7 +193,7 @@ * fool it, but this should catch most mistakes. */ freepages = atomic_read(&buffermem_pages) >> PAGE_SHIFT; - freepages += atomic_read(&page_cache_size); + freepages += page_cache_size; freepages >>= 1; freepages += nr_free_pages(); freepages += nr_swap_pages; diff -urN vm-ref/arch/sparc64/kernel/sys_sunos32.c vm/arch/sparc64/kernel/sys_sunos32.c --- vm-ref/arch/sparc64/kernel/sys_sunos32.c Tue Jan 22 18:52:53 2002 +++ vm/arch/sparc64/kernel/sys_sunos32.c Tue Feb 26 16:44:30 2002 @@ -157,7 +157,7 @@ * fool it, but this should catch most mistakes. */ freepages = atomic_read(&buffermem_pages) >> PAGE_SHIFT; - freepages += atomic_read(&page_cache_size); + freepages += page_cache_size; freepages >>= 1; freepages += nr_free_pages(); freepages += nr_swap_pages; diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Tue Feb 26 16:44:02 2002 +++ vm/fs/buffer.c Tue Feb 26 17:16:34 2002 @@ -105,27 +105,27 @@ struct { int nfract; /* Percentage of buffer cache dirty to activate bdflush */ - int dummy1; /* old "ndirty" */ + int ndirty; /* Maximum number of dirty blocks to write out per + wake-cycle */ int dummy2; /* old "nrefill" */ int dummy3; /* unused */ int interval; /* jiffies delay between kupdate flushes */ int age_buffer; /* Time for normal buffer to age before we flush it */ int nfract_sync;/* Percentage of buffer cache dirty to activate bdflush synchronously */ - int dummy4; /* unused */ + int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */ int dummy5; /* unused */ } b_un; unsigned int data[N_PARAM]; -} bdf_prm = {{40, 0, 0, 0, 5*HZ, 30*HZ, 60, 0, 0}}; +} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ -int 
bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 0, 0, 0}; -int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 6000*HZ, 100, 0, 0}; +int bdflush_min[N_PARAM] = { 0, 1, 0, 0, 0, 1*HZ, 0, 0, 0}; +int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0}; void unlock_buffer(struct buffer_head *bh) { clear_bit(BH_Wait_IO, &bh->b_state); - clear_bit(BH_launder, &bh->b_state); clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); if (waitqueue_active(&bh->b_wait)) @@ -180,6 +180,7 @@ do { struct buffer_head * bh = *array++; bh->b_end_io = end_buffer_io_sync; + clear_bit(BH_Pending_IO, &bh->b_state); submit_bh(WRITE, bh); } while (--count); } @@ -212,6 +213,7 @@ if (atomic_set_buffer_clean(bh)) { __refile_buffer(bh); get_bh(bh); + set_bit(BH_Pending_IO, &bh->b_state); array[count++] = bh; if (count < NRSYNC) continue; @@ -241,7 +243,6 @@ do spin_lock(&lru_list_lock); while (write_some_buffers(dev)); - run_task_queue(&tq_disk); } /* @@ -281,12 +282,6 @@ return 0; } -static inline void wait_for_some_buffers(kdev_t dev) -{ - spin_lock(&lru_list_lock); - wait_for_buffers(dev, BUF_LOCKED, 1); -} - static int wait_for_locked_buffers(kdev_t dev, int index, int refile) { do @@ -737,12 +732,8 @@ static void free_more_memory(void) { - zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; - - balance_dirty(); wakeup_bdflush(); - try_to_free_pages(zone, GFP_NOFS, 0); - run_task_queue(&tq_disk); + try_to_free_pages_nozone(GFP_NOIO); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); schedule(); @@ -1039,8 +1030,10 @@ conditional_schedule(); bh = get_hash_table(dev, block, size); - if (bh) + if (bh) { + touch_buffer(bh); return bh; + } if (!grow_buffers(dev, block, size)) free_more_memory(); @@ -1055,7 +1048,6 @@ unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; - dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT; tot = 
nr_free_buffer_pages(); dirty *= 100; @@ -1072,6 +1064,21 @@ return -1; } +static int bdflush_stop(void) +{ + unsigned long dirty, tot, dirty_limit; + + dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT; + tot = nr_free_buffer_pages(); + + dirty *= 100; + dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush; + + if (dirty > dirty_limit) + return 0; + return 1; +} + /* * if a new dirty buffer is created we need to balance bdflush. * @@ -1086,19 +1093,16 @@ if (state < 0) return; - /* If we're getting into imbalance, start write-out */ - spin_lock(&lru_list_lock); - write_some_buffers(NODEV); + wakeup_bdflush(); /* * And if we're _really_ out of balance, wait for - * some of the dirty/locked buffers ourselves and - * start bdflush. + * some of the dirty/locked buffers ourselves. * This will throttle heavy writers. */ if (state > 0) { - wait_for_some_buffers(NODEV); - wakeup_bdflush(); + spin_lock(&lru_list_lock); + write_some_buffers(NODEV); } } @@ -1192,7 +1196,6 @@ struct buffer_head * bh; bh = getblk(dev, block, size); - touch_buffer(bh); if (buffer_uptodate(bh)) return bh; ll_rw_block(READ, 1, &bh); @@ -2618,20 +2621,25 @@ static int sync_page_buffers(struct buffer_head *head) { struct buffer_head * bh = head; - int tryagain = 0; + int tryagain = 1; do { if (!buffer_dirty(bh) && !buffer_locked(bh)) continue; /* Don't start IO first time around.. */ - if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) + if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) { + tryagain = 0; continue; + } + + if (unlikely(buffer_pending_IO(bh))) { + tryagain = 0; + continue; + } /* Second time through we start actively writing out.. 
*/ if (test_and_set_bit(BH_Lock, &bh->b_state)) { - if (!test_bit(BH_launder, &bh->b_state)) - continue; wait_on_buffer(bh); tryagain = 1; continue; @@ -2644,7 +2652,6 @@ __mark_buffer_clean(bh); get_bh(bh); - set_bit(BH_launder, &bh->b_state); bh->b_end_io = end_buffer_io_sync; submit_bh(WRITE, bh); tryagain = 0; @@ -2747,7 +2754,7 @@ atomic_read(&buffermem_pages) << (PAGE_SHIFT-10)); printk("Cache memory: %6dkB\n", - (atomic_read(&page_cache_size)- atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10)); + (page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10)); #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */ if (!spin_trylock(&lru_list_lock)) @@ -2970,13 +2977,18 @@ complete((struct completion *)startup); for (;;) { + int ndirty = bdf_prm.b_un.ndirty; + CHECK_EMERGENCY_SYNC - spin_lock(&lru_list_lock); - if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) { - wait_for_some_buffers(NODEV); - interruptible_sleep_on(&bdflush_wait); + while (ndirty > 0) { + spin_lock(&lru_list_lock); + if (!write_some_buffers(NODEV)) + break; + ndirty -= NRSYNC; } + if (ndirty > 0 || bdflush_stop()) + interruptible_sleep_on(&bdflush_wait); } } @@ -3005,8 +3017,6 @@ complete((struct completion *)startup); for (;;) { - wait_for_some_buffers(NODEV); - /* update interval */ interval = bdf_prm.b_un.interval; if (interval) { @@ -3034,6 +3044,7 @@ printk(KERN_DEBUG "kupdate() activated...\n"); #endif sync_old_buffers(); + run_task_queue(&tq_disk); } } diff -urN vm-ref/fs/proc/proc_misc.c vm/fs/proc/proc_misc.c --- vm-ref/fs/proc/proc_misc.c Tue Jan 22 18:55:57 2002 +++ vm/fs/proc/proc_misc.c Tue Feb 26 16:44:30 2002 @@ -142,7 +142,7 @@ #define B(x) ((unsigned long long)(x) << PAGE_SHIFT) si_meminfo(&i); si_swapinfo(&i); - pg_size = atomic_read(&page_cache_size) - i.bufferram ; + pg_size = page_cache_size - i.bufferram; len = sprintf(page, " total: used: free: shared: buffers: cached:\n" "Mem: %8Lu %8Lu %8Lu %8Lu %8Lu %8Lu\n" diff -urN 
vm-ref/include/linux/fs.h vm/include/linux/fs.h --- vm-ref/include/linux/fs.h Tue Feb 26 16:43:50 2002 +++ vm/include/linux/fs.h Tue Feb 26 16:44:30 2002 @@ -217,7 +217,7 @@ BH_New, /* 1 if the buffer is new and not yet written out */ BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */ BH_Wait_IO, /* 1 if we should write out this buffer */ - BH_launder, /* 1 if we should throttle on this buffer */ + BH_Pending_IO, /* 1 if the buffer is locked but not in the I/O queue yet */ BH_JBD, /* 1 if it has an attached journal_head */ BH_PrivateStart,/* not a state bit, but the first bit available @@ -279,6 +279,7 @@ #define buffer_mapped(bh) __buffer_state(bh,Mapped) #define buffer_new(bh) __buffer_state(bh,New) #define buffer_async(bh) __buffer_state(bh,Async) +#define buffer_pending_IO(bh) __buffer_state(bh,Pending_IO) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) diff -urN vm-ref/include/linux/kernel.h vm/include/linux/kernel.h --- vm-ref/include/linux/kernel.h Fri Feb 22 20:32:02 2002 +++ vm/include/linux/kernel.h Tue Feb 26 16:44:30 2002 @@ -106,6 +106,8 @@ extern int tainted; extern const char *print_tainted(void); +extern void show_stack(unsigned long *); + #if DEBUG #define pr_debug(fmt,arg...) 
\ printk(KERN_DEBUG fmt,##arg) diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h --- vm-ref/include/linux/mm.h Tue Feb 26 16:44:02 2002 +++ vm/include/linux/mm.h Tue Feb 26 16:44:30 2002 @@ -299,8 +299,10 @@ #define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags) #define PageChecked(page) test_bit(PG_checked, &(page)->flags) #define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) + #define PageLaunder(page) test_bit(PG_launder, &(page)->flags) #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) +#define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) extern void FASTCALL(set_page_dirty(struct page *)); @@ -396,6 +398,8 @@ #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +extern int start_aggressive_readahead(unsigned int); + extern void show_free_areas(void); extern void show_free_areas_node(pg_data_t *pgdat); @@ -460,8 +464,8 @@ return page_count(page) - !!page->buffers == 1; } -extern int can_share_swap_page(struct page *); -extern int remove_exclusive_swap_page(struct page *); +extern int FASTCALL(make_exclusive_page(struct page *, int)); +extern int FASTCALL(remove_exclusive_swap_page(struct page *)); extern void __free_pte(pte_t); diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Tue Jan 22 18:53:55 2002 +++ vm/include/linux/mmzone.h Tue Feb 26 16:44:30 2002 @@ -18,6 +18,11 @@ #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif +#define ZONE_DMA 0 +#define ZONE_NORMAL 1 +#define ZONE_HIGHMEM 2 +#define MAX_NR_ZONES 3 + typedef struct free_area_struct { struct list_head free_list; unsigned long *map; @@ -25,6 +30,10 @@ struct pglist_data; +typedef struct zone_watermarks_s { + unsigned long min, low, high; +} zone_watermarks_t; + /* * On machines where it is needed (eg PCs) we divide physical memory * into multiple physical zones. 
On a PC we have 3 zones: @@ -39,8 +48,27 @@ */ spinlock_t lock; unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; - int need_balance; + + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several + * GB of ram we must reserve some of the lower zone memory (otherwise we risk + * to run OOM on the lower zones despite there's tons of freeable ram + * on the higher zones). + */ + zone_watermarks_t watermarks[MAX_NR_ZONES]; + + /* + * The below fields are protected by different locks (or by + * no lock at all like need_balance), so they're longs to + * provide an atomic granularity against each other on + * all architectures. + */ + unsigned long need_balance; + /* protected by the pagemap_lru_lock */ + unsigned long nr_active_pages, nr_inactive_pages; + /* protected by the pagecache_lock */ + unsigned long nr_cache_pages; /* * free areas of different sizes @@ -60,13 +88,9 @@ */ char *name; unsigned long size; + unsigned long realsize; } zone_t; -#define ZONE_DMA 0 -#define ZONE_NORMAL 1 -#define ZONE_HIGHMEM 2 -#define MAX_NR_ZONES 3 - /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the @@ -113,8 +137,8 @@ extern int numnodes; extern pg_data_t *pgdat_list; -#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ - && ((pgzone) <= (classzone))) +#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +#define memclass(pgzone, classzone) (zone_idx(pgzone) <= zone_idx(classzone)) /* * The following two are not meant for general usage. 
They are here as diff -urN vm-ref/include/linux/pagemap.h vm/include/linux/pagemap.h --- vm-ref/include/linux/pagemap.h Fri Feb 22 20:32:04 2002 +++ vm/include/linux/pagemap.h Tue Feb 26 16:44:30 2002 @@ -45,7 +45,7 @@ #define PAGE_HASH_BITS (page_hash_bits) #define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS) -extern atomic_t page_cache_size; /* # of pages currently in the hash table */ +extern unsigned long page_cache_size; /* # of pages currently in the hash table */ extern struct page **page_hash_table; extern void page_cache_init(unsigned long); diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Tue Feb 26 16:44:02 2002 +++ vm/include/linux/sched.h Tue Feb 26 16:44:30 2002 @@ -280,6 +280,14 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +struct zone_struct; + +struct local_pages { + struct list_head list; + unsigned int order, nr; + struct zone_struct * classzone; +}; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care @@ -325,8 +333,7 @@ struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; + struct local_pages local_pages; /* task state */ struct linux_binfmt *binfmt; @@ -426,7 +433,6 @@ #define PF_DUMPCORE 0x00000200 /* dumped core */ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ -#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ #define PF_FREE_PAGES 0x00002000 /* per process page freeing */ #define PF_NOIO 0x00004000 /* avoid generating further I/O */ diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Tue Feb 26 16:44:02 2002 +++ vm/include/linux/swap.h Tue Feb 26 16:44:30 2002 @@ -88,7 +88,7 @@ extern int nr_active_pages; extern int nr_inactive_pages; extern atomic_t nr_async_pages; -extern atomic_t page_cache_size; +extern unsigned long 
page_cache_size; extern atomic_t buffermem_pages; extern spinlock_cacheline_t pagecache_lock_cacheline; @@ -115,6 +115,8 @@ /* linux/mm/vmscan.c */ extern wait_queue_head_t kswapd_wait; extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); +extern int FASTCALL(try_to_free_pages_nozone(unsigned int)); +extern int vm_vfs_scan_ratio, vm_lru_balance_ratio, vm_mapped_ratio, vm_passes, vm_gfp_debug; /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -176,32 +178,128 @@ BUG(); \ } while (0) +#define inc_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages++; \ + __classzone++; \ + } \ + nr_active_pages++; \ +} while (0) + +#define dec_nr_active_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_active_pages--; \ + __classzone++; \ + } \ + nr_active_pages--; \ +} while (0) + +#define inc_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_inactive_pages++; \ + __classzone++; \ + } \ + nr_inactive_pages++; \ +} while (0) + +#define dec_nr_inactive_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + 
__classzone->nr_inactive_pages--; \ + __classzone++; \ + } \ + nr_inactive_pages--; \ +} while (0) + #define add_page_to_active_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ + inc_nr_active_pages(page); \ } while (0) #define add_page_to_inactive_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ list_add(&(page)->lru, &inactive_list); \ - nr_inactive_pages++; \ + inc_nr_inactive_pages(page); \ } while (0) #define del_page_from_active_list(page) \ do { \ list_del(&(page)->lru); \ ClearPageActive(page); \ - nr_active_pages--; \ + dec_nr_active_pages(page); \ } while (0) #define del_page_from_inactive_list(page) \ do { \ list_del(&(page)->lru); \ - nr_inactive_pages--; \ + dec_nr_inactive_pages(page); \ +} while (0) + +#define inc_nr_cache_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_cache_pages++; \ + __classzone++; \ + } \ + page_cache_size++; \ +} while (0) + +#define dec_nr_cache_pages(page) \ +do { \ + pg_data_t * __pgdat; \ + zone_t * __classzone, * __overflow; \ + \ + __classzone = (page)->zone; \ + __pgdat = __classzone->zone_pgdat; \ + __overflow = __pgdat->node_zones + __pgdat->nr_zones; \ + \ + while (__classzone < __overflow) { \ + __classzone->nr_cache_pages--; \ + __classzone++; \ + } \ + page_cache_size--; \ } while (0) extern spinlock_t swaplock; diff -urN vm-ref/include/linux/sysctl.h vm/include/linux/sysctl.h --- vm-ref/include/linux/sysctl.h Tue Feb 26 16:44:02 2002 +++ vm/include/linux/sysctl.h Tue Feb 26 16:44:30 2002 @@ -141,8 +141,13 @@ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_HEAP_STACK_GAP=11, /* int: page gap between heap and stack */ - 
VM_MIN_READAHEAD=12, /* Min file readahead */ - VM_MAX_READAHEAD=13 /* Max file readahead */ + VM_MIN_READAHEAD=12, /* Min file readahead */ + VM_MAX_READAHEAD=13, /* Max file readahead */ + VM_VFS_SCAN_RATIO=14, /* part of the inactive list to scan */ + VM_LRU_BALANCE_RATIO=15,/* balance active and inactive caches */ + VM_MAPPED_RATIO=16, /* pageout when we find too many mapped pages */ + VM_PASSES=17, /* number of vm passes before failing */ + VM_GFP_DEBUG=18, /* debug GFP failures */ }; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Tue Feb 26 16:44:01 2002 +++ vm/kernel/fork.c Tue Feb 26 16:44:30 2002 @@ -660,7 +660,7 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; - INIT_LIST_HEAD(&p->local_pages); + INIT_LIST_HEAD(&p->local_pages.list); retval = -ENOMEM; /* copy all the process information */ diff -urN vm-ref/kernel/ksyms.c vm/kernel/ksyms.c --- vm-ref/kernel/ksyms.c Tue Feb 26 16:43:50 2002 +++ vm/kernel/ksyms.c Tue Feb 26 16:44:30 2002 @@ -90,6 +90,7 @@ EXPORT_SYMBOL(exit_sighand); /* internal kernel memory management */ +EXPORT_SYMBOL(start_aggressive_readahead); EXPORT_SYMBOL(_alloc_pages); EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Tue Feb 26 16:44:02 2002 +++ vm/kernel/sysctl.c Tue Feb 26 16:44:30 2002 @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -260,6 +261,16 @@ }; static ctl_table vm_table[] = { + {VM_VFS_SCAN_RATIO, "vm_gfp_debug", + &vm_gfp_debug, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_VFS_SCAN_RATIO, "vm_vfs_scan_ratio", + &vm_vfs_scan_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_LRU_BALANCE_RATIO, "vm_lru_balance_ratio", + &vm_lru_balance_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_MAPPED_RATIO, "vm_mapped_ratio", + &vm_mapped_ratio, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_PASSES, "vm_passes", + &vm_passes, sizeof(int), 0644, NULL, 
&proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Tue Feb 26 16:44:03 2002 +++ vm/mm/filemap.c Tue Feb 26 16:44:30 2002 @@ -43,7 +43,7 @@ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli */ -atomic_t page_cache_size = ATOMIC_INIT(0); +unsigned long page_cache_size; unsigned int page_hash_bits; struct page **page_hash_table; @@ -80,7 +80,7 @@ next->pprev_hash = &page->next_hash; if (page->buffers) PAGE_BUG(page); - atomic_inc(&page_cache_size); + inc_nr_cache_pages(page); } static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) @@ -110,7 +110,7 @@ next->pprev_hash = pprev; *pprev = next; page->pprev_hash = NULL; - atomic_dec(&page_cache_size); + dec_nr_cache_pages(page); } /* @@ -766,7 +766,7 @@ void unlock_page(struct page *page) { - clear_bit(PG_launder, &(page)->flags); + ClearPageLaunder(page); smp_mb__before_clear_bit(); if (!test_and_clear_bit(PG_locked, &(page)->flags)) BUG(); @@ -1901,7 +1901,6 @@ * and possibly copy it over to another page.. */ mark_page_accessed(page); - flush_page_to_ram(page); return page; no_cached_page: @@ -3023,8 +3022,15 @@ } unlock: kunmap(page); + + /* + * Mark the page accessed if we wrote the + * beginning or we just did an lseek. + */ + if (!offset || !file->f_reada) + SetPageReferenced(page); + /* Mark it unlocked again and drop the page.. 
*/ - SetPageReferenced(page); UnlockPage(page); page_cache_release(page); diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Tue Feb 26 16:44:01 2002 +++ vm/mm/memory.c Tue Feb 26 16:44:30 2002 @@ -961,15 +961,11 @@ if (!VALID_PAGE(old_page)) goto bad_wp_page; - if (!TryLockPage(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); - spin_unlock(&mm->page_table_lock); - return 1; /* Minor fault */ - } + if (make_exclusive_page(old_page, 1)) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + spin_unlock(&mm->page_table_lock); + return 1; /* Minor fault */ } /* @@ -987,6 +983,19 @@ * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + /* + * keep the page pinned until we return runnable + * to avoid another thread to skip the break_cow + * path, so we're sure the pte_same check below also implies + * that the _contents_ of the old_page didn't change + * under us (not only that the pagetable is the same). + * + * Since we have the page_table_lock acquired here, if the + * pte is the same it means we're still holding an additional + * reference on the old_page so we can safely + * page_cache_release(old_page) before the "pte_same == true" path. 
+ */ + page_cache_release(old_page); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; @@ -998,7 +1007,6 @@ } spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - page_cache_release(old_page); return 1; /* Minor fault */ bad_wp_page: @@ -1151,9 +1159,8 @@ ret = 2; } - mark_page_accessed(page); - - lock_page(page); + if (!Page_Uptodate(page)) + wait_on_page(page); /* * Back out if somebody else faulted in this pte while we @@ -1162,7 +1169,6 @@ spin_lock(&mm->page_table_lock); if (!pte_same(*page_table, orig_pte)) { spin_unlock(&mm->page_table_lock); - unlock_page(page); page_cache_release(page); return 1; } @@ -1170,14 +1176,15 @@ /* The page isn't present yet, go ahead with the fault. */ swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); - mm->rss++; pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) - pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); + if (make_exclusive_page(page, write_access)) { + if (write_access) + pte = pte_mkdirty(pte); + if (vma->vm_flags & VM_WRITE) + pte = pte_mkwrite(pte); + } + mark_page_accessed(page); flush_page_to_ram(page); flush_icache_page(vma, page); @@ -1215,15 +1222,14 @@ spin_lock(&mm->page_table_lock); if (!pte_none(*page_table)) { - page_cache_release(page); spin_unlock(&mm->page_table_lock); + page_cache_release(page); return 1; } mm->rss++; flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); - mark_page_accessed(page); } set_pte(page_table, entry); @@ -1302,9 +1308,9 @@ entry = pte_mkwrite(pte_mkdirty(entry)); set_pte(page_table, entry); } else { + spin_unlock(&mm->page_table_lock); /* One of our sibling threads was faster, back out. 
*/ page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); return 1; } diff -urN vm-ref/mm/mmap.c vm/mm/mmap.c --- vm-ref/mm/mmap.c Tue Feb 26 16:44:02 2002 +++ vm/mm/mmap.c Tue Feb 26 16:44:30 2002 @@ -69,7 +69,7 @@ return 1; /* The page cache contains buffer pages these days.. */ - free = atomic_read(&page_cache_size); + free = page_cache_size; free += nr_free_pages(); free += nr_swap_pages; diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Tue Jan 22 18:55:26 2002 +++ vm/mm/oom_kill.c Tue Feb 26 16:44:30 2002 @@ -150,7 +150,6 @@ * exit() and clear out its resources quickly... */ p->counter = 5 * HZ; - p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) { diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Tue Feb 26 16:43:50 2002 +++ vm/mm/page_alloc.c Tue Feb 26 16:44:30 2002 @@ -31,6 +31,9 @@ static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; + +int vm_gfp_debug = 0; /* * Free_page() adds the page to the free lists. 
This is optimized for @@ -144,14 +147,14 @@ return; local_freelist: - if (current->nr_local_pages) + if ((current->local_pages.nr && !current->local_pages.order) || + !memclass(page->zone, current->local_pages.classzone) || + in_interrupt()) goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - list_add(&page->list, ¤t->local_pages); + list_add(&page->list, ¤t->local_pages.list); page->index = order; - current->nr_local_pages++; + current->local_pages.nr++; } #define MARK_USED(index, order, area) \ @@ -236,35 +239,36 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) { struct page * page = NULL; - int __freed = 0; + int __freed; - if (!(gfp_mask & __GFP_WAIT)) - goto out; if (in_interrupt()) BUG(); - current->allocation_order = order; + current->local_pages.order = order; + current->local_pages.classzone = classzone; current->flags |= PF_MEMALLOC | PF_FREE_PAGES; __freed = try_to_free_pages(classzone, gfp_mask, order); current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - if (current->nr_local_pages) { + if (current->local_pages.nr) { struct list_head * entry, * local_pages; struct page * tmp; int nr_pages; - local_pages = ¤t->local_pages; + local_pages = ¤t->local_pages.list; if (likely(__freed)) { /* pick from the last inserted so we're lifo */ entry = local_pages->next; do { tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(tmp->zone, classzone)) { + if (!memclass(tmp->zone, classzone)) + BUG(); + if (tmp->index == order) { list_del(entry); - current->nr_local_pages--; + current->local_pages.nr--; set_page_count(tmp, 1); page = tmp; @@ -290,7 +294,7 @@ } while ((entry = entry->next) != local_pages); } - nr_pages = current->nr_local_pages; + nr_pages = current->local_pages.nr; /* free in reverse order so that the global order will be lifo */ while ((entry = local_pages->prev) != local_pages) { list_del(entry); @@ -299,33 +303,37 @@ if 
(!nr_pages--) BUG(); } - current->nr_local_pages = 0; + current->local_pages.nr = 0; } - out: *freed = __freed; return page; } +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) +{ + long free = zone->free_pages - (1UL << order); + return free >= 0 ? free : 0; +} + /* * This is the 'heart' of the zoned buddy allocator: */ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - unsigned long min; zone_t **zone, * classzone; struct page * page; - int freed; + int freed, class_idx; zone = zonelist->zones; classzone = *zone; - min = 1UL << order; + class_idx = zone_idx(classzone); + for (;;) { zone_t *z = *(zone++); if (!z) break; - min += z->pages_low; - if (z->free_pages > min) { + if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { page = rmqueue(z, order); if (page) return page; @@ -338,18 +346,16 @@ wake_up_interruptible(&kswapd_wait); zone = zonelist->zones; - min = 1UL << order; for (;;) { - unsigned long local_min; + unsigned long min; zone_t *z = *(zone++); if (!z) break; - local_min = z->pages_min; + min = z->watermarks[class_idx].min; if (!(gfp_mask & __GFP_WAIT)) - local_min >>= 2; - min += local_min; - if (z->free_pages > min) { + min >>= 2; + if (zone_free_pages(z, order) > min) { page = rmqueue(z, order); if (page) return page; @@ -358,8 +364,7 @@ /* here we're in the low on memory slow path */ -rebalance: - if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) { + if (current->flags & PF_MEMALLOC && !in_interrupt()) { zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); @@ -375,36 +380,51 @@ /* Atomic allocations - we can't balance anything */ if (!(gfp_mask & __GFP_WAIT)) - return NULL; + goto out; + rebalance: page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; zone = zonelist->zones; - min = 1UL << order; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + 
 break; - min += z->pages_min; - if (z->free_pages > min) { - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { + page = rmqueue(z, order); + if (page) + return page; + } } - } + goto rebalance; + } else { + /* + * Check that no other task has been killed meanwhile, + * in such a case we can succeed the allocation. + */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* Don't let big-order allocations loop */ - if (order > 3) - return NULL; + if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } - /* Yield for kswapd, and try again */ - current->policy |= SCHED_YIELD; - __set_current_state(TASK_RUNNING); - schedule(); - goto rebalance; + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); + if (vm_gfp_debug) + show_stack(NULL); + return NULL; } /* @@ -518,18 +538,25 @@ { pg_data_t *pgdat = pgdat_list; unsigned int sum = 0; + zonelist_t *zonelist; + zone_t **zonep, *zone; do { - zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); - zone_t **zonep = zonelist->zones; - zone_t *zone; - - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; - } + int class_idx; + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); + zonep = zonelist->zones; + zone = *zonep; + class_idx = zone_idx(zone); + + sum += zone->nr_cache_pages; + do { + unsigned int free = zone->free_pages - zone->watermarks[class_idx].high; + zonep++; + zone = *zonep; + if (free <= 0) + continue; + sum += free; + } while (zone); pgdat = pgdat->node_next; } while (pgdat); @@ -551,6 +578,65 @@ } #endif +/* + * If it returns non zero it means there's lots of ram "free" + * (note: not in cache!) 
so any caller will know that + * he can allocate some memory to do some more aggressive + * (possibly wasteful) readahead. The state of the memory + * should be rechecked after every few pages allocated for + * doing this aggressive readahead. + * + * The gfp_mask parameter specifies in which kind of memory + * the readahead information will be allocated to. + */ +int start_aggressive_readahead(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep, *zone; + int ret = 0; + + do { + int class_idx; + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + zone = *(zonep++); + class_idx = zone_idx(zone); + + for (; zone; zone = *(zonep++)) + if (zone->free_pages > zone->watermarks[class_idx].high * 2) + ret = 1; + + pgdat = pgdat->node_next; + } while (pgdat); + + return ret; +} + +int try_to_free_pages_nozone(unsigned int gfp_mask) +{ + pg_data_t *pgdat = pgdat_list; + zonelist_t *zonelist; + zone_t **zonep; + int ret = 0; + unsigned long pf_free_pages; + + pf_free_pages = current->flags & PF_FREE_PAGES; + current->flags &= ~PF_FREE_PAGES; + + do { + zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK); + zonep = zonelist->zones; + + ret |= try_to_free_pages(*zonep, gfp_mask, 0); + + pgdat = pgdat->node_next; + } while (pgdat); + + current->flags |= pf_free_pages; + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* @@ -572,13 +658,9 @@ zone_t *zone; for (zone = tmpdat->node_zones; zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) - printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB " - "high:%6lukB\n", + printk("Zone:%s freepages:%6lukB\n", zone->name, - K(zone->free_pages), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high)); + K(zone->free_pages)); tmpdat = tmpdat->node_next; } @@ -744,6 +826,7 @@ zone_t *zone = pgdat->node_zones + j; unsigned long mask; unsigned long size, realsize; + int idx; realsize = size = zones_size[j]; if (zholes_size) @@ 
-751,11 +834,13 @@ printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; + zone->realsize = realsize; zone->name = zone_names[j]; zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; if (!size) continue; @@ -766,9 +851,29 @@ mask = zone_balance_min[j]; else if (mask > zone_balance_max[j]) mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; + zone->watermarks[j].min = mask; + zone->watermarks[j].low = mask*2; + zone->watermarks[j].high = mask*3; + /* now set the watermarks of the lower zones in the "j" classzone */ + for (idx = j-1; idx >= 0; idx--) { + zone_t * lower_zone = pgdat->node_zones + idx; + unsigned long lower_zone_reserve; + if (!lower_zone->size) + continue; + + mask = lower_zone->watermarks[idx].min; + lower_zone->watermarks[j].min = mask; + lower_zone->watermarks[j].low = mask*2; + lower_zone->watermarks[j].high = mask*3; + + /* now the brainer part */ + lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; + lower_zone->watermarks[j].min += lower_zone_reserve; + lower_zone->watermarks[j].low += lower_zone_reserve; + lower_zone->watermarks[j].high += lower_zone_reserve; + + realsize += lower_zone->realsize; + } zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; @@ -844,3 +949,16 @@ } __setup("memfrac=", setup_mem_frac); + +static int __init setup_lower_zone_reserve(char *str) +{ + int j = 0; + + while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2); + printk("setup_lower_zone_reserve: "); + for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); + printk("\n"); + return 1; +} + +__setup("lower_zone_reserve=", setup_lower_zone_reserve); diff -urN vm-ref/mm/page_io.c vm/mm/page_io.c --- vm-ref/mm/page_io.c Tue Jan 22 18:56:00 2002 +++ vm/mm/page_io.c Tue Feb 26 16:44:30 2002 @@ -73,10 +73,6 @@ /* block_size == 
PAGE_SIZE/zones_used */ brw_page(rw, page, dev, zones, block_size); - /* Note! For consistency we do all of the logic, - * decrementing the page count, and unlocking the page in the - * swap lock map - in the IO completion handler. - */ return 1; } diff -urN vm-ref/mm/slab.c vm/mm/slab.c --- vm-ref/mm/slab.c Tue Jan 22 18:56:30 2002 +++ vm/mm/slab.c Tue Feb 26 16:44:30 2002 @@ -916,8 +916,6 @@ slab_t *slabp; int ret; - drain_cpu_caches(cachep); - spin_lock_irq(&cachep->spinlock); /* If the cache is growing, stop shrinking. */ @@ -987,6 +985,8 @@ kmem_cache_t, next); list_del(&cachep->next); up(&cache_chain_sem); + + drain_cpu_caches(cachep); if (__kmem_cache_shrink(cachep)) { printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n", diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Tue Jan 22 18:56:00 2002 +++ vm/mm/swap.c Tue Feb 26 16:44:30 2002 @@ -36,18 +36,17 @@ /* * Move an inactive page to the active list. */ -static inline void activate_page_nolock(struct page * page) -{ - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(page); - add_page_to_active_list(page); - } -} - void activate_page(struct page * page) { spin_lock(&pagemap_lru_lock); - activate_page_nolock(page); + if (PageLRU(page)) { + if (!PageActive(page)) { + del_page_from_inactive_list(page); + add_page_to_active_list(page); + ClearPageReferenced(page); + } else + SetPageReferenced(page); + } spin_unlock(&pagemap_lru_lock); } diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c --- vm-ref/mm/swap_state.c Tue Jan 22 18:55:27 2002 +++ vm/mm/swap_state.c Tue Feb 26 16:44:30 2002 @@ -117,7 +117,9 @@ if (!PageLocked(page)) BUG(); - block_flushpage(page, 0); + if (!block_flushpage(page, 0)) + /* an anonymous page cannot have page->buffers set */ + BUG(); entry.val = page->index; diff -urN vm-ref/mm/swapfile.c vm/mm/swapfile.c --- vm-ref/mm/swapfile.c Fri Feb 22 20:32:04 2002 +++ vm/mm/swapfile.c Tue Feb 26 16:44:30 2002 @@ -227,6 +227,7 @@ * Check if we're the 
only user of a swap page, * when the page is locked. */ +static int FASTCALL(exclusive_swap_page(struct page *page)); static int exclusive_swap_page(struct page *page) { int retval = 0; @@ -240,12 +241,13 @@ if (p->swap_map[SWP_OFFSET(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) + if (PageSwapCache(page) && page_count(page) - !!page->buffers == 2) retval = 1; spin_unlock(&pagecache_lock); } swap_info_put(p); } + return retval; } @@ -257,21 +259,42 @@ * work, but we opportunistically check whether * we need to get all the locks first.. */ -int can_share_swap_page(struct page *page) +int make_exclusive_page(struct page *page, int write) { int retval = 0; - if (!PageLocked(page)) - BUG(); switch (page_count(page)) { case 3: if (!page->buffers) break; /* Fallthrough */ case 2: + /* racy fastpath check */ if (!PageSwapCache(page)) break; - retval = exclusive_swap_page(page); + + if ((!write && !vm_swap_full()) || TryLockPage(page)) { + /* + * Don't remove the page from the swapcache if: + * - it was a read fault and... + * - the swap isn't full + * or if + * - we failed acquiring the page lock + * + * NOTE: if failed acquiring the lock we cannot remove the + * page from the swapcache, but still we can safely takeover + * the page if it's exclusive, see the swapcache check in + * the innermost critical section of exclusive_swap_page(). + */ + retval = exclusive_swap_page(page); + } else { + /* + * Here we've the page lock acquired and we're asked + * to try to drop this page from the swapcache. + */ + retval = remove_exclusive_swap_page(page); + unlock_page(page); + } break; case 1: if (PageReserved(page)) @@ -300,7 +323,7 @@ entry.val = page->index; p = swap_info_get(entry); - if (!p) + if (unlikely(!p)) return 0; /* Is the only swap cache user the cache itself? */ @@ -309,7 +332,11 @@ /* Recheck the page count with the pagecache lock held.. 
*/ spin_lock(&pagecache_lock); if (page_count(page) - !!page->buffers == 2) { + if (page->buffers && !try_to_free_buffers(page, 0)) + /* an anonymous page cannot have page->buffers set */ + BUG(); __delete_from_swap_cache(page); + swap_entry_free(p, SWP_OFFSET(entry)); SetPageDirty(page); retval = 1; } @@ -317,11 +344,8 @@ } swap_info_put(p); - if (retval) { - block_flushpage(page, 0); - swap_free(entry); + if (retval) page_cache_release(page); - } return retval; } @@ -343,11 +367,7 @@ } if (page) { page_cache_get(page); - /* Only cache user (+us), or swap space full? Free it! */ - if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) { - delete_from_swap_cache(page); - SetPageDirty(page); - } + remove_exclusive_swap_page(page); UnlockPage(page); page_cache_release(page); } diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Fri Feb 22 20:32:04 2002 +++ vm/mm/vmscan.c Tue Feb 26 17:42:40 2002 @@ -25,12 +25,32 @@ #include /* - * The "priority" of VM scanning is how much of the queues we - * will scan in one go. A value of 6 for DEF_PRIORITY implies - * that we'll scan 1/64th of the queues ("queue_length >> 6") - * during a normal aging round. + * "vm_vfs_scan_ratio" is how much of the VFS queues we will scan + * in one go. A value of 6 for vm_vfs_scan_ratio implies that we'll + * scan 1/6 of the inactive lists during a normal aging round. */ -#define DEF_PRIORITY (6) +int vm_vfs_scan_ratio = 6; + +/* + * "vm_mapped_ratio" controls when we start to swapout, the bigger, + * the earlier we'll start to swapout. + */ +int vm_mapped_ratio = 100; + +/* + * "vm_lru_balance_ratio" controls the balance between active and + * inactive cache. The bigger vm_balance is, the easier the + * active cache will grow, because we'll rotate the active list + * slowly. A value of 2 means we'll go towards a balance of + * 1/3 of the cache being inactive. 
+ */ +int vm_lru_balance_ratio = 2; + +/* + * "vm_passes" is the number of vm passes before failing the + * memory balancing. + */ +int vm_passes = 20; /* * The swap-out function returns 1 if it successfully @@ -53,10 +73,6 @@ return 0; } - /* Don't bother unmapping pages that are active */ - if (PageActive(page)) - return 0; - /* Don't bother replenishing zones not under pressure.. */ if (!memclass(page->zone, classzone)) return 0; @@ -256,6 +272,7 @@ { unsigned long address; struct vm_area_struct* vma; + int tlb_flush = 0; /* * Find the proper vm-area after freezing the vma chain @@ -270,6 +287,7 @@ } vma = find_vma(mm, address); if (vma) { + tlb_flush = 1; if (address < vma->vm_start) address = vma->vm_start; @@ -288,11 +306,13 @@ out_unlock: spin_unlock(&mm->page_table_lock); + if (tlb_flush) + flush_tlb_mm(mm); return count; } -static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)); -static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone) +static int FASTCALL(swap_out(zone_t * classzone)); +static int swap_out(zone_t * classzone) { int counter, nr_pages = SWAP_CLUSTER_MAX; struct mm_struct *mm; @@ -333,15 +353,18 @@ return 0; } -static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)); -static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) +static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)); +static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout) { struct list_head * entry; - int max_scan = nr_inactive_pages / priority; - int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10); + /* + * Worst case 3 passes, one for submitting the I/O, + * one for waiting the I/O and one to finally release the page. 
+ */ + int max_scan = classzone->nr_inactive_pages * 3; + int max_mapped = SWAP_CLUSTER_MAX * vm_mapped_ratio; - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { + while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) { struct page * page; if (unlikely(current->need_resched)) { @@ -372,6 +395,8 @@ if (!memclass(page->zone, classzone)) continue; + max_scan--; + /* Racy check to avoid trylocking when not worthwhile */ if (!page->buffers && (page_count(page) != 1 || !page->mapping)) goto page_mapped; @@ -469,34 +494,43 @@ spin_lock(&pagecache_lock); /* - * this is the non-racy check for busy page. + * This is the non-racy check for busy page. + * It is critical to check PageDirty _after_ we made sure + * the page is freeable so not in use by anybody. + * At this point we're guaranteed that page->buffers is NULL, + * nobody can refill page->buffers under us because we still + * hold the page lock. */ - if (!page->mapping || !is_page_cache_freeable(page)) { + if (!page->mapping || page_count(page) > 1) { spin_unlock(&pagecache_lock); UnlockPage(page); -page_mapped: - if (--max_mapped >= 0) - continue; + page_mapped: + if (--max_mapped < 0) { + spin_unlock(&pagemap_lru_lock); - /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! - */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } + shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); + shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask); +#endif - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. 
- */ + if (!*failed_swapout) + *failed_swapout = !swap_out(classzone); + + max_mapped = SWAP_CLUSTER_MAX * vm_mapped_ratio; + spin_lock(&pagemap_lru_lock); + } + continue; + + } if (PageDirty(page)) { spin_unlock(&pagecache_lock); UnlockPage(page); continue; } + __lru_cache_del(page); + /* point of no return */ if (likely(!PageSwapCache(page))) { __remove_inode_page(page); @@ -509,7 +543,6 @@ swap_free(swap); } - __lru_cache_del(page); UnlockPage(page); /* effectively free the page here */ @@ -531,17 +564,21 @@ * We move them the other way when we see the * reference bit on the page. */ -static void refill_inactive(int nr_pages) +static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone)); +static void refill_inactive(int nr_pages, zone_t * classzone) { struct list_head * entry; - spin_lock(&pagemap_lru_lock); entry = active_list.prev; while (nr_pages && entry != &active_list) { struct page * page; page = list_entry(entry, struct page, lru); entry = entry->prev; + + if (!memclass(page->zone, classzone)) + continue; + if (PageTestandClearReferenced(page)) { list_del(&page->lru); list_add(&page->lru, &active_list); @@ -554,54 +591,73 @@ add_page_to_inactive_list(page); SetPageReferenced(page); } - spin_unlock(&pagemap_lru_lock); + if (entry != &active_list) { + list_del(&active_list); + list_add(&active_list, entry); + } } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)); +static int shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout) { - int chunk_size = nr_pages; unsigned long ratio; nr_pages -= kmem_cache_reap(gfp_mask); if (nr_pages <= 0) return 0; - nr_pages = chunk_size; - /* try to keep the active list 2/3 of the size of the cache */ - ratio 
= (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2); - refill_inactive(ratio); - - nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority); - if (nr_pages <= 0) - return 0; + spin_lock(&pagemap_lru_lock); + ratio = (unsigned long) SWAP_CLUSTER_MAX * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1); + /* + * We limit the ratio to avoid looping in the refill_inactive for too long a time. + * There must be at least "SWAP_CLUSTER_MAX * vm_mapped_ratio" pages in the inactive list + * or we can fall into swapout false positives because we reach the end of the list. + * Multiplied by 2 to have a reasonable margin (the list can shrink while we browse it). + */ + if (ratio > SWAP_CLUSTER_MAX * vm_mapped_ratio * 2) + ratio = SWAP_CLUSTER_MAX * vm_mapped_ratio * 2; + refill_inactive(ratio, classzone); - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout); return nr_pages; } +static int check_classzone_need_balance(zone_t * classzone); + int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; - gfp_mask = pf_gfp_mask(gfp_mask); - do { - nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages); - if (nr_pages <= 0) - return 1; - } while (--priority); - /* - * Hmm.. Cache shrink failed - time to kill something? - * Mhwahahhaha! This is the part I really like. Giggle.
- */ - out_of_memory(); + for (;;) { + int tries = vm_passes; + int failed_swapout = !(gfp_mask & __GFP_IO); + int nr_pages = SWAP_CLUSTER_MAX; + + do { + nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout); + if (nr_pages <= 0) + return 1; + + shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask); + shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask); +#ifdef CONFIG_QUOTA + shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask); +#endif + + if (!failed_swapout) + failed_swapout = !swap_out(classzone); + } while (--tries); + + if (likely(current->pid != 1)) + break; + if (!check_classzone_need_balance(classzone)) + break; + current->policy |= SCHED_YIELD; + __set_current_state(TASK_RUNNING); + schedule(); + } + return 0; } @@ -609,11 +665,12 @@ static int check_classzone_need_balance(zone_t * classzone) { - zone_t * first_classzone; + zone_t * first_zone; + int class_idx = zone_idx(classzone); - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) + first_zone = classzone->zone_pgdat->node_zones; + while (classzone >= first_zone) { + if (classzone->free_pages > classzone->watermarks[class_idx].high) return 0; classzone--; } @@ -629,12 +686,12 @@ zone = pgdat->node_zones + i; if (unlikely(current->need_resched)) schedule(); - if (!zone->need_balance) + if (!zone->need_balance || !zone->size) continue; if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(HZ*5); continue; } if (check_classzone_need_balance(zone)) @@ -667,7 +724,7 @@ for (i = pgdat->nr_zones-1; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!zone->need_balance) + if (!zone->need_balance || !zone->size) continue; return 0; }