diff -urN vm-ref/arch/alpha/mm/fault.c vm/arch/alpha/mm/fault.c --- vm-ref/arch/alpha/mm/fault.c Mon Sep 17 01:26:12 2001 +++ vm/arch/alpha/mm/fault.c Mon Sep 17 01:26:25 2001 @@ -140,6 +140,7 @@ goto bad_area; } + survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -194,6 +195,12 @@ * us unable to handle the page fault gracefully. */ out_of_memory: + if (current->pid == 1) { + current->policy |= SCHED_YIELD; + schedule(); + down_read(&mm->mmap_sem); + goto survive; + } printk(KERN_ALERT "VM: killing process %s(%d)\n", current->comm, current->pid); if (!user_mode(regs)) diff -urN vm-ref/arch/i386/mm/fault.c vm/arch/i386/mm/fault.c --- vm-ref/arch/i386/mm/fault.c Mon Sep 17 01:26:12 2001 +++ vm/arch/i386/mm/fault.c Mon Sep 17 01:26:25 2001 @@ -51,8 +51,14 @@ start &= PAGE_MASK; for (;;) { - if (handle_mm_fault(current->mm, vma, start, 1) <= 0) - goto bad_area; + survive: + { + int fault = handle_mm_fault(current->mm, vma, start, 1); + if (!fault) + goto bad_area; + if (fault < 0) + goto out_of_memory; + } if (!size) break; size--; @@ -76,6 +82,14 @@ bad_area: return 0; + +out_of_memory: + if (current->pid == 1) { + current->policy |= SCHED_YIELD; + schedule(); + goto survive; + } + goto bad_area; } extern spinlock_t console_lock, timerlist_lock; @@ -198,6 +212,7 @@ goto bad_area; } + survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -300,6 +315,12 @@ */ out_of_memory: up_read(&mm->mmap_sem); + if (tsk->pid == 1) { + tsk->policy |= SCHED_YIELD; + schedule(); + down_read(&mm->mmap_sem); + goto survive; + } printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) do_exit(SIGKILL); diff -urN vm-ref/fs/buffer.c vm/fs/buffer.c --- vm-ref/fs/buffer.c Mon Sep 17 01:26:14 2001 +++ vm/fs/buffer.c Mon Sep 17 01:26:25 2001 @@ -135,6 +135,7 @@ inline void unlock_buffer(struct buffer_head *bh) { + clear_bit(BH_Wait_IO, &bh->b_state); clear_bit(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); if (waitqueue_active(&bh->b_wait)) @@ -838,9 +839,7 @@ static void free_more_memory(void) { balance_dirty(); - page_launder(GFP_NOFS, 0); wakeup_bdflush(); - wakeup_kswapd(); current->policy |= SCHED_YIELD; __set_current_state(TASK_RUNNING); schedule(); @@ -1185,6 +1184,7 @@ out: write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); + touch_buffer(bh); return bh; } @@ -1349,7 +1349,6 @@ struct buffer_head * bh; bh = getblk(dev, block, size); - touch_buffer(bh); if (buffer_uptodate(bh)) return bh; ll_rw_block(READ, 1, &bh); @@ -2516,34 +2515,31 @@ return 0; } -/* - * Sync all the buffers on one page.. - * - * If we have old buffers that are locked, we'll - * wait on them, but we won't wait on the new ones - * we're writing out now. - * - * This all is required so that we can free up memory - * later. 
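/*
 * The three fault-handler hunks above share one pattern: the out_of_memory
 * path must never kill init (pid 1), because losing init brings the whole
 * system down.  Instead init yields the CPU so kswapd/bdflush can make
 * progress, then retries the fault at the new "survive:" label.  A
 * hypothetical helper capturing that idiom (the name is not in the patch,
 * each architecture open-codes it as shown above):
 */
static inline int oom_yield_and_retry(struct task_struct *tsk)
{
        if (tsk->pid != 1)
                return 0;               /* ordinary task: caller may kill it */
        tsk->policy |= SCHED_YIELD;     /* let the reclaim daemons run */
        schedule();
        return 1;                       /* caller should goto survive */
}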
- * - * Wait: - * 0 - no wait (this does not get called - see try_to_free_buffers below) - * 1 - start IO for dirty buffers - * 2 - wait for completion of locked buffers - */ -static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask) +static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask) { - struct buffer_head * tmp = bh; + struct buffer_head * p = bh; + int tryagain = 1; do { - struct buffer_head *p = tmp; - tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (gfp_mask & __GFP_WAIT) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) - ll_rw_block(WRITE, 1, &p); - } while (tmp != bh); + if (buffer_dirty(p) || buffer_locked(p)) { + if (test_and_set_bit(BH_Wait_IO, &p->b_state)) { + if (buffer_dirty(p)) { + ll_rw_block(WRITE, 1, &p); + tryagain = 0; + } else if (buffer_locked(p)) { + if (gfp_mask & __GFP_WAIT) { + wait_on_buffer(p); + tryagain = 1; + } else + tryagain = 0; + } + } else + tryagain = 0; + } + p = p->b_this_page; + } while (p != bh); + + return tryagain; } /* @@ -2614,16 +2610,16 @@ write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); if (gfp_mask & __GFP_IO && !(current->flags & PF_ATOMICALLOC)) { - if (!(gfp_mask & __GFP_HIGHIO) && PageHighMem(page)) - return 0; - sync_page_buffers(bh, gfp_mask); - /* We waited synchronously, so we can free the buffers. */ - if (gfp_mask & __GFP_WAIT) { - gfp_mask = 0; /* no IO or waiting this time around */ - goto cleaned_buffers_try_again; + if (gfp_mask & __GFP_HIGHIO || !PageHighMem(page)) { + if (sync_page_buffers(bh, gfp_mask)) { + /* no IO or waiting next time */ + gfp_mask = 0; + goto cleaned_buffers_try_again; + } } - wakeup_bdflush(); } + if (balance_dirty_state() >= 0) + wakeup_bdflush(); return 0; } diff -urN vm-ref/fs/dcache.c vm/fs/dcache.c --- vm-ref/fs/dcache.c Mon Sep 17 01:26:14 2001 +++ vm/fs/dcache.c Mon Sep 17 01:26:25 2001 @@ -569,7 +569,7 @@ if (!(gfp_mask & __GFP_FS)) return 0; - count = dentry_stat.nr_unused >> priority; + count = dentry_stat.nr_unused / priority; prune_dcache(count); kmem_cache_shrink(dentry_cache); diff -urN vm-ref/fs/inode.c vm/fs/inode.c --- vm-ref/fs/inode.c Mon Sep 17 01:26:13 2001 +++ vm/fs/inode.c Mon Sep 17 01:26:25 2001 @@ -278,27 +278,18 @@ } } -static inline int try_to_sync_unused_list(struct list_head *head) +static inline int try_to_sync_unused_list(struct list_head *head, int nr_inodes) { struct list_head *tmp = head; struct inode *inode; - while ((tmp = tmp->prev) != head) { + while (nr_inodes && (tmp = tmp->prev) != head) { inode = list_entry(tmp, struct inode, i_list); if (!atomic_read(&inode->i_count)) { - /* - * We're under PF_MEMALLOC here, and syncing the - * inode may have to allocate memory. To avoid - * running into a OOM deadlock, we write one - * inode synchronously and stop syncing in case - * we're under freepages.low - */ + __sync_one(inode, 0); + nr_inodes--; - int sync = nr_free_pages() < freepages.low; - __sync_one(inode, sync); - if (sync) - return 0; /* * __sync_one moved the inode to another list, * so we have to start looking from the list head. @@ -306,7 +297,8 @@ tmp = head; } } - return 1; + + return nr_inodes; } void sync_inodes_sb(struct super_block *sb) @@ -402,24 +394,25 @@ } } -/* - * Called with the spinlock already held.. 
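/*
 * Sketch of the two-pass throttle the rewritten sync_page_buffers() above
 * implements: the first time a dirty or locked buffer is seen it is only
 * tagged with BH_Wait_IO; write-out (or waiting, when __GFP_WAIT allows
 * blocking) happens only if the tag was already set on an earlier pass,
 * and unlock_buffer() clears the tag once the I/O completes.  Condensed
 * single-buffer form with a hypothetical name, not part of the patch:
 */
static int throttle_one_buffer(struct buffer_head *bh, unsigned int gfp_mask)
{
        if (!buffer_dirty(bh) && !buffer_locked(bh))
                return 1;                       /* clean and idle: freeable */
        if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
                return 0;                       /* first sighting: only tag it */
        if (buffer_dirty(bh)) {
                ll_rw_block(WRITE, 1, &bh);     /* second sighting: start the write */
                return 0;
        }
        if (gfp_mask & __GFP_WAIT) {
                wait_on_buffer(bh);             /* locked: wait if we may block */
                return 1;
        }
        return 0;                               /* cannot block: try again later */
}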
- */ -static void try_to_sync_unused_inodes(void) +static void try_to_sync_unused_inodes(void * arg) { struct super_block * sb; + int nr_inodes = inodes_stat.nr_unused; + spin_lock(&inode_lock); spin_lock(&sb_lock); sb = sb_entry(super_blocks.next); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { + for (; nr_inodes && sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.next)) { spin_unlock(&sb_lock); - if (!try_to_sync_unused_list(&sb->s_dirty)) - return; + nr_inodes = try_to_sync_unused_list(&sb->s_dirty, nr_inodes); spin_lock(&sb_lock); } spin_unlock(&sb_lock); + spin_unlock(&inode_lock); } +static struct tq_struct unused_inodes_flush_task; + /** * write_inode_now - write an inode to disk * @inode: inode to write to disk @@ -672,12 +665,11 @@ { LIST_HEAD(list); struct list_head *entry, *freeable = &list; - int count, synced = 0; + int count; struct inode * inode; spin_lock(&inode_lock); -free_unused: count = 0; entry = inode_unused.prev; while (entry != &inode_unused) @@ -707,18 +699,13 @@ dispose_list(freeable); /* - * If we freed enough clean inodes, avoid writing - * dirty ones. Also giveup if we already tried to - * sync dirty inodes. + * If we didn't freed enough clean inodes schedule + * a sync of the dirty inodes, we cannot do it + * from here or we're either synchronously dogslow + * or we deadlock with oom. */ - if (!goal || synced) - return; - - synced = 1; - - spin_lock(&inode_lock); - try_to_sync_unused_inodes(); - goto free_unused; + if (goal) + schedule_task(&unused_inodes_flush_task); } int shrink_icache_memory(int priority, int gfp_mask) @@ -735,7 +722,7 @@ if (!(gfp_mask & __GFP_FS)) return 0; - count = inodes_stat.nr_unused >> priority; + count = inodes_stat.nr_unused / priority; prune_icache(count); kmem_cache_shrink(inode_cachep); @@ -1182,6 +1169,8 @@ NULL); if (!inode_cachep) panic("cannot create inode slab cache"); + + unused_inodes_flush_task.routine = try_to_sync_unused_inodes; } /** diff -urN vm-ref/fs/proc/proc_misc.c vm/fs/proc/proc_misc.c --- vm-ref/fs/proc/proc_misc.c Mon Sep 17 01:26:14 2001 +++ vm/fs/proc/proc_misc.c Mon Sep 17 01:26:25 2001 @@ -168,9 +168,7 @@ "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" "Active: %8u kB\n" - "Inact_dirty: %8u kB\n" - "Inact_clean: %8u kB\n" - "Inact_target: %8lu kB\n" + "Inactive: %8u kB\n" "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" @@ -184,9 +182,7 @@ K(atomic_read(&page_cache_size) - swapper_space.nrpages), K(swapper_space.nrpages), K(nr_active_pages), - K(nr_inactive_dirty_pages), - K(nr_inactive_clean_pages()), - K(inactive_target), + K(nr_inactive_pages), K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), diff -urN vm-ref/include/linux/fs.h vm/include/linux/fs.h --- vm-ref/include/linux/fs.h Mon Sep 17 01:26:14 2001 +++ vm/include/linux/fs.h Mon Sep 17 01:26:25 2001 @@ -216,6 +216,7 @@ BH_Mapped, /* 1 if the buffer has a disk mapping */ BH_New, /* 1 if the buffer is new and not yet written out */ BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */ + BH_Wait_IO, /* 1 if we should throttle on this buffer */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities diff -urN vm-ref/include/linux/highmem.h vm/include/linux/highmem.h --- vm-ref/include/linux/highmem.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/highmem.h Mon Sep 17 01:26:25 2001 @@ -11,7 +11,7 @@ #include /* declarations for linux/mm/highmem.c */ -FASTCALL(unsigned int nr_free_highpages(void)); +unsigned int nr_free_highpages(void); 
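/*
 * The fs/inode.c changes above stop prune_icache() from writing dirty
 * unused inodes itself: doing that from the reclaim path can block for a
 * long time and can recurse into memory allocation (an OOM deadlock).
 * Instead the flush is handed to keventd through the stock 2.4 task-queue
 * interface and runs later in process context.  The whole mechanism is
 * just (fragments, mirroring the hunks above):
 */
#include <linux/tqueue.h>

static struct tq_struct unused_inodes_flush_task;

        /* set up once, where the inode slab cache is created: */
        unused_inodes_flush_task.routine = try_to_sync_unused_inodes;

        /* and fired whenever prune_icache() falls short of its goal: */
        if (goal)
                schedule_task(&unused_inodes_flush_task);       /* runs in keventd */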
extern struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig); diff -urN vm-ref/include/linux/list.h vm/include/linux/list.h --- vm-ref/include/linux/list.h Mon Sep 17 01:26:13 2001 +++ vm/include/linux/list.h Mon Sep 17 01:26:25 2001 @@ -92,6 +92,7 @@ static __inline__ void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); + entry->next = entry->prev = 0; } /** diff -urN vm-ref/include/linux/mm.h vm/include/linux/mm.h --- vm-ref/include/linux/mm.h Mon Sep 17 01:26:15 2001 +++ vm/include/linux/mm.h Mon Sep 17 01:26:25 2001 @@ -19,7 +19,7 @@ extern int page_cluster; /* The inactive_clean lists are per zone. */ extern struct list_head active_list; -extern struct list_head inactive_dirty_list; +extern struct list_head inactive_list; #include #include @@ -154,7 +154,6 @@ updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ - unsigned long age; /* Page aging counter. */ wait_queue_head_t wait; /* Page locked? Stand in line... */ struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ @@ -275,16 +274,14 @@ #define PG_dirty 4 #define PG_decr_after 5 #define PG_active 6 -#define PG_inactive_dirty 7 +#define PG_inactive 7 #define PG_slab 8 #define PG_swap_cache 9 #define PG_skip 10 -#define PG_inactive_clean 11 -#define PG_highmem 12 -#define PG_checked 13 /* kill me in 2.5.. */ - /* bits 21-29 unused */ -#define PG_arch_1 30 -#define PG_reserved 31 +#define PG_highmem 11 +#define PG_checked 12 /* kill me in 2.5.. */ +#define PG_arch_1 13 +#define PG_reserved 14 /* Make it prettier to test the above... */ #define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) @@ -347,14 +344,14 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags) -#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags) -#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags) - -#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags) -#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags) -#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags) +#define PageInactive(page) test_bit(PG_inactive, &(page)->flags) +#define SetPageInactive(page) set_bit(PG_inactive, &(page)->flags) +#define ClearPageInactive(page) clear_bit(PG_inactive, &(page)->flags) +#define TestandSetPageInactive(page) test_and_set_bit(PG_inactive, &(page)->flags) +#define TestandClearPageInactive(page) test_and_clear_bit(PG_inactive, &(page)->flags) #ifdef CONFIG_HIGHMEM #define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) @@ -380,11 +377,11 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). 
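/*
 * The list.h hunk above poisons entries on removal (next = prev = 0).
 * The point is to turn list misuse into an immediate, easy-to-spot oops;
 * this matters for the LRU rework below, where pages move between exactly
 * one active and one inactive list and a double removal is always a bug.
 * Illustration (not in the patch):
 */
        list_del(&page->lru);
        /* ... page is later freed, or re-deleted by mistake ... */
        list_del(&page->lru);   /* without poisoning: silently corrupts the
                                 * neighbours it was once linked to;
                                 * with poisoning: NULL dereference, instant oops */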
*/ -extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned long order)); -extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist)); -extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order); +extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); +extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)); +extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); -static inline struct page * alloc_pages(int gfp_mask, unsigned long order) +static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) { /* * Gets optimized away by the compiler. @@ -396,8 +393,8 @@ #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -extern unsigned long FASTCALL(__get_free_pages(int gfp_mask, unsigned long order)); -extern unsigned long FASTCALL(get_zeroed_page(int gfp_mask)); +extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) @@ -413,8 +410,8 @@ /* * There is only one 'core' page-freeing function. */ -extern void FASTCALL(__free_pages(struct page *page, unsigned long order)); -extern void FASTCALL(free_pages(unsigned long addr, unsigned long order)); +extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); +extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); extern void * FASTCALL(alloc_exact(unsigned int size)); extern void FASTCALL(free_exact(void * addr, unsigned int size)); @@ -469,6 +466,11 @@ extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); + +static inline int is_page_cache_freeable(struct page * page) +{ + return page_count(page) - !!page->buffers == 1; +} /* * Work out if there are any other processes sharing this diff -urN vm-ref/include/linux/mmzone.h vm/include/linux/mmzone.h --- vm-ref/include/linux/mmzone.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/mmzone.h Mon Sep 17 01:26:25 2001 @@ -39,14 +39,12 @@ */ spinlock_t lock; unsigned long free_pages; - unsigned long inactive_clean_pages; - unsigned long inactive_dirty_pages; unsigned long pages_min, pages_low, pages_high; + int need_balance; /* * free areas of different sizes */ - struct list_head inactive_clean_list; free_area_t free_area[MAX_ORDER]; /* @@ -101,6 +99,7 @@ typedef struct pglist_data { zone_t node_zones[MAX_NR_ZONES]; zonelist_t node_zonelists[GFP_ZONEMASK+1]; + int nr_zones; struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; @@ -114,8 +113,8 @@ extern int numnodes; extern pg_data_t *pgdat_list; -#define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \ - && ((pgzone) <= (tzone))) +#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \ + && ((pgzone) <= (classzone))) /* * The following two are not meant for general usage. 
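/*
 * "classzone" (the renamed memclass() argument above) is the zone an
 * allocation was aimed at; every zone at or below it in the same node's
 * node_zones[] array (DMA < Normal < HighMem) is an acceptable fallback,
 * and reclaim progress is credited against that class.  The raw pointer
 * comparison in memclass() works because all zones of one pg_data_t live
 * in one array; an index-based equivalent (hypothetical name, sketch only)
 * would be:
 */
static inline int zone_in_class(zone_t *pgzone, zone_t *classzone)
{
        return pgzone->zone_pgdat == classzone->zone_pgdat &&
               (pgzone - pgzone->zone_pgdat->node_zones) <=
               (classzone - classzone->zone_pgdat->node_zones);
}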
They are here as diff -urN vm-ref/include/linux/pagemap.h vm/include/linux/pagemap.h --- vm-ref/include/linux/pagemap.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/pagemap.h Mon Sep 17 01:26:25 2001 @@ -29,7 +29,6 @@ #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(x) get_page(x) -#define page_cache_free(x) __free_page(x) #define page_cache_release(x) __free_page(x) static inline struct page *page_cache_alloc(struct address_space *x) diff -urN vm-ref/include/linux/sched.h vm/include/linux/sched.h --- vm-ref/include/linux/sched.h Mon Sep 17 01:26:15 2001 +++ vm/include/linux/sched.h Mon Sep 17 01:26:25 2001 @@ -318,6 +318,8 @@ int get_child_timeslice; struct task_struct *next_task, *prev_task; struct mm_struct *active_mm; + struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; @@ -417,6 +419,7 @@ #define PF_MEMALLOC (1UL<<5) /* Allocating memory */ #define PF_USEDFPU (1UL<<6) /* task used FPU this quantum (SMP) */ #define PF_ATOMICALLOC (1UL<<7) /* do not block during memalloc */ +#define PF_FREE_PAGES (1UL<<8) /* per process page freeing */ /* * Ptrace flags diff -urN vm-ref/include/linux/slab.h vm/include/linux/slab.h --- vm-ref/include/linux/slab.h Mon Sep 17 00:14:59 2001 +++ vm/include/linux/slab.h Mon Sep 17 01:26:25 2001 @@ -60,7 +60,7 @@ extern void *kmalloc(size_t, int); extern void kfree(const void *); -extern void kmem_cache_reap(int); +extern int FASTCALL(kmem_cache_reap(int)); extern int slabinfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data); extern int slabinfo_write_proc(struct file *file, const char *buffer, diff -urN vm-ref/include/linux/swap.h vm/include/linux/swap.h --- vm-ref/include/linux/swap.h Mon Sep 17 01:26:13 2001 +++ vm/include/linux/swap.h Mon Sep 17 01:26:25 2001 @@ -80,10 +80,9 @@ extern int nr_swap_pages; extern unsigned int nr_free_pages(void); -extern unsigned int nr_inactive_clean_pages(void); extern unsigned int nr_free_buffer_pages(void); extern int nr_active_pages; -extern int nr_inactive_dirty_pages; +extern int nr_inactive_pages; extern atomic_t nr_async_pages; extern struct address_space swapper_space; extern atomic_t page_cache_size; @@ -99,26 +98,20 @@ struct zone_t; /* linux/mm/swap.c */ -extern int memory_pressure; -extern void deactivate_page(struct page *); -extern void deactivate_page_nolock(struct page *); -extern void activate_page(struct page *); -extern void activate_page_nolock(struct page *); -extern void lru_cache_add(struct page *); -extern void __lru_cache_del(struct page *); -extern void lru_cache_del(struct page *); -extern void recalculate_vm_stats(void); +extern void FASTCALL(lru_cache_add(struct page *)); +extern void FASTCALL(__lru_cache_del(struct page *)); +extern void FASTCALL(lru_cache_del(struct page *)); + +extern void FASTCALL(deactivate_page(struct page *)); +extern void FASTCALL(deactivate_page_nolock(struct page *)); +extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(activate_page_nolock(struct page *)); + extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern struct page * reclaim_page(zone_t *); extern wait_queue_head_t kswapd_wait; -extern wait_queue_head_t kreclaimd_wait; -extern int page_launder(int, int); -extern int free_shortage(void); -extern int inactive_shortage(void); -extern void wakeup_kswapd(void); -extern int try_to_free_pages(unsigned int gfp_mask); +extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, 
unsigned int)); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *); @@ -134,7 +127,6 @@ extern struct page * read_swap_cache_async(swp_entry_t); /* linux/mm/oom_kill.c */ -extern int out_of_memory(void); extern void oom_kill(void); /* @@ -146,7 +138,6 @@ extern void free_page_and_swap_cache(struct page *page); /* linux/mm/swapfile.c */ -extern int vm_swap_full(void); extern unsigned int nr_swapfiles; extern struct swap_info_struct swap_info[]; extern int is_swap_partition(kdev_t); @@ -179,90 +170,51 @@ extern spinlock_t pagemap_lru_lock; -extern void FASTCALL(mark_page_accessed(struct page *)); - -/* - * Page aging defines. - * Since we do exponential decay of the page age, we - * can chose a fairly large maximum. - */ -#define PAGE_AGE_START 2 -#define PAGE_AGE_ADV 3 -#define PAGE_AGE_MAX 64 - /* * List add/del helper macros. These must be called * with the pagemap_lru_lock held! */ -#define DEBUG_ADD_PAGE \ - if (PageActive(page) || PageInactiveDirty(page) || \ - PageInactiveClean(page)) BUG(); - -#define ZERO_PAGE_BUG \ - if (page_count(page) == 0) BUG(); - -#define add_page_to_active_list(page) { \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ - page->age = 0; \ - ClearPageReferenced(page); \ - SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - nr_active_pages++; \ -} - -#define add_page_to_inactive_dirty_list(page) { \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ - SetPageInactiveDirty(page); \ - list_add(&(page)->lru, &inactive_dirty_list); \ - nr_inactive_dirty_pages++; \ - page->zone->inactive_dirty_pages++; \ -} - -#define add_page_to_inactive_clean_list(page) { \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ - SetPageInactiveClean(page); \ - list_add(&(page)->lru, &page->zone->inactive_clean_list); \ - page->zone->inactive_clean_pages++; \ -} - -#define del_page_from_active_list(page) { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_active_pages--; \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ -} - -#define del_page_from_inactive_dirty_list(page) { \ - list_del(&(page)->lru); \ - ClearPageInactiveDirty(page); \ - nr_inactive_dirty_pages--; \ - page->zone->inactive_dirty_pages--; \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ -} - -#define del_page_from_inactive_clean_list(page) { \ - list_del(&(page)->lru); \ - ClearPageInactiveClean(page); \ - page->zone->inactive_clean_pages--; \ - DEBUG_ADD_PAGE \ - ZERO_PAGE_BUG \ -} - -/* - * In mm/swap.c::recalculate_vm_stats(), we substract - * inactive_target from memory_pressure every second. - * This means that memory_pressure is smoothed over - * 64 (1 << INACTIVE_SHIFT) seconds. 
- */ -#define INACTIVE_SHIFT 6 -#define inactive_target min_t(unsigned long, \ - (memory_pressure >> INACTIVE_SHIFT), \ - (num_physpages / 4)) +#define DEBUG_LRU_PAGE(page) \ +do { \ + if (PageActive(page)) \ + BUG(); \ + if (PageInactive(page)) \ + BUG(); \ + if (page_count(page) == 0) \ + BUG(); \ +} while (0) + +#define add_page_to_active_list(page) \ +do { \ + DEBUG_LRU_PAGE(page); \ + SetPageActive(page); \ + list_add(&(page)->lru, &active_list); \ + nr_active_pages++; \ +} while (0) + +#define add_page_to_inactive_list(page) \ +do { \ + DEBUG_LRU_PAGE(page); \ + SetPageInactive(page); \ + list_add(&(page)->lru, &inactive_list); \ + nr_inactive_pages++; \ +} while (0) + +#define del_page_from_active_list(page) \ +do { \ + list_del(&(page)->lru); \ + ClearPageActive(page); \ + nr_active_pages--; \ + DEBUG_LRU_PAGE(page); \ +} while (0) + +#define del_page_from_inactive_list(page) \ +do { \ + list_del(&(page)->lru); \ + ClearPageInactive(page); \ + nr_inactive_pages--; \ + DEBUG_LRU_PAGE(page); \ +} while (0) /* * Ugly ugly ugly HACK to make sure the inactive lists diff -urN vm-ref/include/linux/swapctl.h vm/include/linux/swapctl.h --- vm-ref/include/linux/swapctl.h Sun Sep 16 14:15:30 2001 +++ vm/include/linux/swapctl.h Mon Sep 17 01:26:25 2001 @@ -1,28 +1,6 @@ #ifndef _LINUX_SWAPCTL_H #define _LINUX_SWAPCTL_H -#include -#include - -typedef struct buffer_mem_v1 -{ - unsigned int min_percent; - unsigned int borrow_percent; - unsigned int max_percent; -} buffer_mem_v1; -typedef buffer_mem_v1 buffer_mem_t; -extern buffer_mem_t buffer_mem; -extern buffer_mem_t page_cache; - -typedef struct freepages_v1 -{ - unsigned int min; - unsigned int low; - unsigned int high; -} freepages_v1; -typedef freepages_v1 freepages_t; -extern freepages_t freepages; - typedef struct pager_daemon_v1 { unsigned int tries_base; diff -urN vm-ref/kernel/fork.c vm/kernel/fork.c --- vm-ref/kernel/fork.c Mon Sep 17 01:26:15 2001 +++ vm/kernel/fork.c Mon Sep 17 01:26:25 2001 @@ -649,6 +649,8 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + INIT_LIST_HEAD(&p->local_pages); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) diff -urN vm-ref/kernel/signal.c vm/kernel/signal.c --- vm-ref/kernel/signal.c Mon Sep 17 01:26:12 2001 +++ vm/kernel/signal.c Mon Sep 17 01:26:25 2001 @@ -382,7 +382,7 @@ switch (sig) { case SIGKILL: case SIGCONT: /* Wake up the process if stopped. 
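/*
 * The macros above replace the old three-list scheme (active,
 * inactive_dirty, per-zone inactive_clean) and the page->age counter with
 * a single global active/inactive pair plus the per-page referenced bit.
 * All movement happens under pagemap_lru_lock and keeps the two global
 * counters exact; condensed from the mm/swap.c hunks further down into one
 * hypothetical helper (not part of the patch):
 */
static void lru_move(struct page *page, int make_active)
{
        spin_lock(&pagemap_lru_lock);
        if (make_active && PageInactive(page)) {
                del_page_from_inactive_list(page);
                add_page_to_active_list(page);
        } else if (!make_active && PageActive(page)) {
                del_page_from_active_list(page);
                add_page_to_inactive_list(page);
        }
        spin_unlock(&pagemap_lru_lock);
}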
*/ - if (t->state == TASK_STOPPED) + if (t->state == TASK_STOPPED && !(t->ptrace & PT_PTRACED)) wake_up_process(t); t->exit_code = 0; rm_sig_from_queue(SIGSTOP, t); diff -urN vm-ref/kernel/sysctl.c vm/kernel/sysctl.c --- vm-ref/kernel/sysctl.c Mon Sep 17 01:26:12 2001 +++ vm/kernel/sysctl.c Mon Sep 17 01:26:25 2001 @@ -254,17 +254,11 @@ }; static ctl_table vm_table[] = { - {VM_FREEPG, "freepages", - &freepages, sizeof(freepages_t), 0444, NULL, &proc_dointvec}, {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, - {VM_BUFFERMEM, "buffermem", - &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, - {VM_PAGECACHE, "pagecache", - &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", diff -urN vm-ref/mm/filemap.c vm/mm/filemap.c --- vm-ref/mm/filemap.c Mon Sep 17 01:26:15 2001 +++ vm/mm/filemap.c Mon Sep 17 01:26:25 2001 @@ -419,6 +419,9 @@ if (page->index == offset) break; } + + SetPageReferenced(page); + not_found: return page; } @@ -596,9 +599,9 @@ if (!PageLocked(page)) BUG(); + page->index = index; page_cache_get(page); spin_lock(&pagecache_lock); - page->index = index; add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, page_hash(mapping, index)); lru_cache_add(page); @@ -618,7 +621,7 @@ if (PageLocked(page)) BUG(); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1) | (1 << PG_checked)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); page->flags = flags | (1 << PG_locked); page_cache_get(page); page->index = offset; @@ -658,7 +661,8 @@ * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static inline int page_cache_read(struct file * file, unsigned long offset) +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) { struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; @@ -666,7 +670,7 @@ struct page *page; spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, offset, *hash); + page = __find_page_nolock(mapping, offset, *hash); spin_unlock(&pagecache_lock); if (page) return 0; @@ -684,7 +688,7 @@ * We arrive here in the unlikely event that someone * raced with us and added our page to the cache first. */ - page_cache_free(page); + page_cache_release(page); return 0; } @@ -692,6 +696,8 @@ * Read in an entire cluster at once. A cluster is usually a 64k- * aligned block that includes the page requested in "offset." */ +static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, + unsigned long filesize)); static int read_cluster_nonblocking(struct file * file, unsigned long offset, unsigned long filesize) { @@ -1081,26 +1087,6 @@ } /* - * Mark a page as having seen activity. - * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. 
- */ -void mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page)) { - activate_page(page); - ClearPageReferenced(page); - return; - } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); -} - -/* * This is a generic file read routine, and uses the * inode->i_op->readpage() function for the actual low-level * stuff. @@ -1224,7 +1210,6 @@ index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; - mark_page_accessed(page); page_cache_release(page); if (ret == nr && desc->count) continue; @@ -1320,7 +1305,7 @@ *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; filp->f_reada = 1; if (cached_page) - page_cache_free(cached_page); + page_cache_release(cached_page); UPDATE_ATIME(inode); } @@ -2569,8 +2554,7 @@ } } if (cached_page) - page_cache_free(cached_page); - mark_page_accessed(page); + page_cache_release(cached_page); return page; } @@ -2639,7 +2623,7 @@ struct page *cached_page = NULL; struct page *page = __grab_cache_page(mapping,index,&cached_page); if (cached_page) - page_cache_free(cached_page); + page_cache_release(cached_page); return page; } @@ -2861,7 +2845,7 @@ *ppos = pos; if (cached_page) - page_cache_free(cached_page); + page_cache_release(cached_page); /* For now, when the user asks for O_SYNC, we'll actually * provide O_DSYNC. */ diff -urN vm-ref/mm/memory.c vm/mm/memory.c --- vm-ref/mm/memory.c Mon Sep 17 01:26:12 2001 +++ vm/mm/memory.c Mon Sep 17 01:26:27 2001 @@ -274,12 +274,8 @@ * free_page() used to be able to clear swap cache * entries. We may now have to do it manually. */ - if (page->mapping) { - if (pte_dirty(pte)) - set_page_dirty(page); - if (pte_young(pte)) - mark_page_accessed(page); - } + if (pte_dirty(pte) && page->mapping) + set_page_dirty(page); free_page_and_swap_cache(page); return 1; } @@ -928,6 +924,10 @@ break; /* Recheck swapcachedness once the page is locked */ can_reuse = exclusive_swap_page(old_page); +#if 1 + if (can_reuse) + delete_from_swap_cache_nolock(old_page); +#endif UnlockPage(old_page); if (!can_reuse) break; @@ -1154,12 +1154,13 @@ swap_free(entry); if (exclusive_swap_page(page)) { +#if 0 if (write_access) pte = pte_mkwrite(pte_mkdirty(pte)); - if (vm_swap_full()) { - delete_from_swap_cache_nolock(page); - pte = pte_mkdirty(pte); - } +#else + delete_from_swap_cache_nolock(page); + pte = pte_mkwrite(pte_mkdirty(pte)); +#endif } UnlockPage(page); diff -urN vm-ref/mm/numa.c vm/mm/numa.c --- vm-ref/mm/numa.c Wed Jul 4 04:03:47 2001 +++ vm/mm/numa.c Mon Sep 17 01:26:25 2001 @@ -31,7 +31,7 @@ #endif /* !CONFIG_DISCONTIGMEM */ -struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) +struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order) { #ifdef CONFIG_NUMA return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); @@ -82,8 +82,8 @@ memset(pgdat->valid_addr_bitmap, 0, size); } -static struct page * alloc_pages_pgdat(pg_data_t *pgdat, int gfp_mask, - unsigned long order) +static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask, + unsigned int order) { return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK)); } @@ -92,7 +92,7 @@ * This can be refined. Currently, tries to do round robin, instead * should do concentratic circle search, starting from current node. 
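/*
 * The mm/memory.c hunks above change swap-cache policy on the fault path:
 * when the faulting task turns out to be the only user of a swapped-in
 * page, the swap-cache copy (and with it the swap slot) is dropped at once
 * and the pte mapped writable and dirty, rather than keeping the slot
 * until the old vm_swap_full() heuristic (removed from swapfile.c below)
 * declared swap nearly full.  Net effect of the #if 0/#else blocks above,
 * sketched:
 */
        if (exclusive_swap_page(page)) {
                delete_from_swap_cache_nolock(page);    /* free the swap slot now */
                pte = pte_mkwrite(pte_mkdirty(pte));    /* must be rewritten if evicted */
        }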
*/ -struct page * _alloc_pages(unsigned int gfp_mask, unsigned long order) +struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order) { struct page *ret = 0; pg_data_t *start, *temp; diff -urN vm-ref/mm/oom_kill.c vm/mm/oom_kill.c --- vm-ref/mm/oom_kill.c Thu Aug 16 22:03:41 2001 +++ vm/mm/oom_kill.c Mon Sep 17 01:26:25 2001 @@ -192,43 +192,3 @@ schedule(); return; } - -/** - * out_of_memory - is the system out of memory? - * - * Returns 0 if there is still enough memory left, - * 1 when we are out of memory (otherwise). - */ -int out_of_memory(void) -{ - long cache_mem, limit; - - /* Enough free memory? Not OOM. */ - if (nr_free_pages() > freepages.min) - return 0; - - if (nr_free_pages() + nr_inactive_clean_pages() > freepages.low) - return 0; - - /* - * If the buffer and page cache (excluding swap cache) are over - * their (/proc tunable) minimum, we're still not OOM. We test - * this to make sure we don't return OOM when the system simply - * has a hard time with the cache. - */ - cache_mem = atomic_read(&page_cache_size); - cache_mem += atomic_read(&buffermem_pages); - cache_mem -= swapper_space.nrpages; - limit = (page_cache.min_percent + buffer_mem.min_percent); - limit *= num_physpages / 100; - - if (cache_mem > limit) - return 0; - - /* Enough swap space left? Not OOM. */ - if (nr_swap_pages > 0) - return 0; - - /* Else... */ - return 1; -} diff -urN vm-ref/mm/page_alloc.c vm/mm/page_alloc.c --- vm-ref/mm/page_alloc.c Mon Sep 17 01:26:14 2001 +++ vm/mm/page_alloc.c Mon Sep 17 01:26:25 2001 @@ -21,16 +21,16 @@ int nr_swap_pages; int nr_active_pages; -int nr_inactive_dirty_pages; +int nr_inactive_pages; +struct list_head inactive_list; +struct list_head active_list; pg_data_t *pgdat_list; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] = { 32, 128, 128, }; -static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, }; -static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, }; +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 32, 128, 128, }; +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; -struct list_head active_list; -struct list_head inactive_dirty_list; /* * Free_page() adds the page to the free lists. This is optimized for * fast normal cases (no error jumps taken normally). @@ -62,8 +62,8 @@ * Hint: -mask = 1+~mask */ -static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order)); -static void __free_pages_ok (struct page *page, unsigned long order) +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); +static void __free_pages_ok (struct page *page, unsigned int order) { unsigned long index, page_idx, mask, flags; free_area_t *area; @@ -84,14 +84,15 @@ BUG(); if (PageActive(page)) BUG(); - if (PageInactiveDirty(page)) + if (PageInactive(page)) BUG(); - if (PageInactiveClean(page)) + if (PageDirty(page)) BUG(); - page->flags &= ~((1<age = PAGE_AGE_START; - + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + zone = page->zone; mask = (~0UL) << order; @@ -136,14 +137,21 @@ memlist_add_head(&(base + page_idx)->list, &area->free_list); spin_unlock_irqrestore(&zone->lock, flags); + return; + local_freelist: /* - * We don't want to protect this variable from race conditions - * since it's nothing important, but we do want to make sure - * it never gets negative. 
+ * This is a little subtle: if the allocation order + * wanted is major than zero we'd better take all the pages + * local since we must deal with fragmentation too and we + * can't rely on the nr_local_pages information. */ - if (memory_pressure > NR_CPUS) - memory_pressure--; + if (current->nr_local_pages && !current->allocation_order) + goto back_local_freelist; + + list_add(&page->list, ¤t->local_pages); + page->index = order; + current->nr_local_pages++; } #define MARK_USED(index, order, area) \ @@ -170,11 +178,11 @@ return page; } -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order)); -static struct page * rmqueue(zone_t *zone, unsigned long order) +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); +static struct page * rmqueue(zone_t *zone, unsigned int order) { free_area_t * area = zone->free_area + order; - unsigned long curr_order = order; + unsigned int curr_order = order; struct list_head *head, *curr; unsigned long flags; struct page *page; @@ -194,7 +202,7 @@ index = page - zone->zone_mem_map; if (curr_order != MAX_ORDER-1) MARK_USED(index, curr_order, area); - zone->free_pages -= 1 << order; + zone->free_pages -= 1UL << order; page = expand(zone, page, index, order, curr_order, area); spin_unlock_irqrestore(&zone->lock, flags); @@ -202,7 +210,7 @@ set_page_count(page, 1); if (BAD_RANGE(zone,page)) BUG(); - DEBUG_ADD_PAGE + DEBUG_LRU_PAGE(page); return page; } curr_order++; @@ -213,305 +221,193 @@ return NULL; } -#define PAGES_MIN 0 -#define PAGES_LOW 1 -#define PAGES_HIGH 2 - -/* - * This function does the dirty work for __alloc_pages - * and is separated out to keep the code size smaller. - * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM) - */ -static struct page * __alloc_pages_limit(zonelist_t *zonelist, - unsigned long order, int limit, int direct_reclaim) +#ifndef CONFIG_DISCONTIGMEM +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) { - zone_t **zone = zonelist->zones; + return __alloc_pages(gfp_mask, order, + contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); +} +#endif - for (;;) { - zone_t *z = *(zone++); - unsigned long water_mark; +static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +{ + struct page * page = NULL; + int __freed = 0; - if (!z) - break; - if (!z->size) - BUG(); + if (!(gfp_mask & __GFP_WAIT)) + goto out; + if (in_interrupt()) + BUG(); + + current->allocation_order = order; + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; + + __freed = try_to_free_pages(classzone, gfp_mask, order); + + current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); + + if (current->nr_local_pages) { + struct list_head * entry, * local_pages; + struct page * tmp; + int nr_pages; + + local_pages = ¤t->local_pages; + + if (__freed) { + /* pick from the last inserted so we're lifo */ + entry = local_pages->next; + do { + tmp = list_entry(entry, struct page, list); + if (tmp->index == order && memclass(tmp->zone, classzone)) { + list_del(entry); + current->nr_local_pages--; + set_page_count(tmp, 1); + page = tmp; + + if (page->buffers) + BUG(); + if (page->mapping) + BUG(); + if (!VALID_PAGE(page)) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (PageLocked(page)) + BUG(); + if (PageDecrAfter(page)) + BUG(); + if (PageActive(page)) + BUG(); + if (PageInactive(page)) + BUG(); + if (PageDirty(page)) + BUG(); - /* - * We allocate if the number of free 
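/*
 * Above is the free-side half of the new per-task "local pages" scheme:
 * while a task runs reclaim from inside the allocator (balance_classzone()
 * below sets PF_FREE_PAGES), pages it frees are parked on
 * current->local_pages, tagged with their order in page->index, instead of
 * going back to the buddy lists where another CPU could steal them.  An
 * order-0 caller keeps only the first such page; higher-order callers keep
 * everything because of fragmentation.  Condensed from __free_pages_ok()
 * above (sketch, not the literal patch code):
 */
        if (current->flags & PF_FREE_PAGES &&
            !(current->nr_local_pages && !current->allocation_order)) {
                list_add(&page->list, &current->local_pages);
                page->index = order;    /* remembered so the right size is handed back */
                current->nr_local_pages++;
                return;
        }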
+ inactive_clean - * pages is above the watermark. - */ - switch (limit) { - default: - case PAGES_MIN: - water_mark = z->pages_min; - break; - case PAGES_LOW: - water_mark = z->pages_low; - break; - case PAGES_HIGH: - water_mark = z->pages_high; + break; + } + } while ((entry = entry->next) != local_pages); } - if (z->free_pages + z->inactive_clean_pages >= water_mark) { - struct page *page = NULL; - /* If possible, reclaim a page directly. */ - if (direct_reclaim) - page = reclaim_page(z); - /* If that fails, fall back to rmqueue. */ - if (!page) - page = rmqueue(z, order); - if (page) - return page; + nr_pages = current->nr_local_pages; + /* free in reverse order so that the global order will be lifo */ + while ((entry = local_pages->prev) != local_pages) { + list_del(entry); + tmp = list_entry(entry, struct page, list); + __free_pages_ok(tmp, tmp->index); + if (!nr_pages--) + BUG(); } + current->nr_local_pages = 0; } - - /* Found nothing. */ - return NULL; + out: + *freed = __freed; + return page; } -#ifndef CONFIG_DISCONTIGMEM -struct page *_alloc_pages(unsigned int gfp_mask, unsigned long order) +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) { - return __alloc_pages(gfp_mask, order, - contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); + long free = zone->free_pages - (1UL << order); + return free >= 0 ? free : 0; } -#endif /* * This is the 'heart' of the zoned buddy allocator: */ -struct page * __alloc_pages(unsigned int gfp_mask, unsigned long order, zonelist_t *zonelist) +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) { - zone_t **zone; - int direct_reclaim = 0; + zone_t **zone, * classzone; struct page * page; + int freed; - /* - * Allocations put pressure on the VM subsystem. - */ - memory_pressure++; + zone = zonelist->zones; + classzone = *zone; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* - * (If anyone calls gfp from interrupts nonatomically then it - * will sooner or later tripped up by a schedule().) - * - * We are falling back to lower-level zones if allocation - * in a higher zone fails. - */ + if (zone_free_pages(z, order) > z->pages_low) { + page = rmqueue(z, order); + if (page) + return page; + } + } - /* - * Can we take pages directly from the inactive_clean - * list? - */ - if (order == 0 && (gfp_mask & __GFP_WAIT)) - direct_reclaim = 1; + classzone->need_balance = 1; + mb(); + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); -try_again: - /* - * First, see if we have any zones with lots of free memory. - * - * We allocate free memory first because it doesn't contain - * any data ... DUH! - */ zone = zonelist->zones; for (;;) { zone_t *z = *(zone++); if (!z) break; - if (!z->size) - BUG(); - if (z->free_pages >= z->pages_low) { + if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) { page = rmqueue(z, order); if (page) return page; - } else if (z->free_pages < z->pages_min && - waitqueue_active(&kreclaimd_wait)) { - wake_up_interruptible(&kreclaimd_wait); } } - /* - * Try to allocate a page from a zone with a HIGH - * amount of free + inactive_clean pages. - * - * If there is a lot of activity, inactive_target - * will be high and we'll have a good chance of - * finding a page using the HIGH limit. 
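/*
 * The rewritten __alloc_pages() above walks the zonelist with a
 * progressively lower bar instead of the old PAGES_HIGH/LOW/MIN helper:
 * first pages_low; then, after marking the classzone for balancing and
 * waking kswapd, pages_min (half of that for __GFP_HIGH/atomic callers);
 * PF_MEMALLOC callers skip the check entirely.  The per-pass test is
 * essentially (hypothetical helper, sketch only):
 */
static inline int zone_can_allocate(zone_t *z, unsigned int order,
                                    unsigned int gfp_mask, int low_on_memory)
{
        unsigned long mark = z->pages_low;
        if (low_on_memory)
                mark = (gfp_mask & __GFP_HIGH) ? z->pages_min / 2 : z->pages_min;
        return zone_free_pages(z, order) > mark;
}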
- */ - page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim); - if (page) - return page; + /* here we're in the low on memory slow path */ - /* - * Then try to allocate a page from a zone with more - * than zone->pages_low free + inactive_clean pages. - * - * When the working set is very large and VM activity - * is low, we're most likely to have our allocation - * succeed here. - */ - page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim); - if (page) - return page; + if (current->flags & PF_MEMALLOC) { + zone = zonelist->zones; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* - * OK, none of the zones on our zonelist has lots - * of pages free. - * - * We wake up kswapd, in the hope that kswapd will - * resolve this situation before memory gets tight. - * - * We also yield the CPU, because that: - * - gives kswapd a chance to do something - * - slows down allocations, in particular the - * allocations from the fast allocator that's - * causing the problems ... - * - ... which minimises the impact the "bad guys" - * have on the rest of the system - * - if we don't have __GFP_IO set, kswapd may be - * able to free some memory we can't free ourselves - */ - wakeup_kswapd(); - if (gfp_mask & __GFP_WAIT && !(current->flags & PF_ATOMICALLOC)) { - __set_current_state(TASK_RUNNING); - current->policy |= SCHED_YIELD; - schedule(); + page = rmqueue(z, order); + if (page) + return page; + } + return NULL; } - /* - * After waking up kswapd, we try to allocate a page - * from any zone which isn't critical yet. - * - * Kswapd should, in most situations, bring the situation - * back to normal in no time. - */ - page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim); + page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; - /* - * Damn, we didn't succeed. - * - * This can be due to 2 reasons: - * - we're doing a higher-order allocation - * --> move pages to the free list until we succeed - * - we're /really/ tight on memory - * --> try to free pages ourselves with page_launder - */ - if (!(current->flags & PF_MEMALLOC)) { - /* - * Are we dealing with a higher order allocation? - * - * Move pages from the inactive_clean to the free list - * in the hope of creating a large, physically contiguous - * piece of free memory. - */ - if (order > 0 && (gfp_mask & __GFP_WAIT)) { - zone = zonelist->zones; - /* First, clean some dirty pages. */ - current->flags |= PF_MEMALLOC; - page_launder(gfp_mask, 1); - current->flags &= ~PF_MEMALLOC; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->size) - continue; - while (z->inactive_clean_pages) { - struct page * page; - /* Move one page to the free list. */ - page = reclaim_page(z); - if (!page) - break; - __free_page(page); - /* Try if the allocation succeeds. */ - page = rmqueue(z, order); - if (page) - return page; - } - } - } - /* - * When we arrive here, we are really tight on memory. - * Since kswapd didn't succeed in freeing pages for us, - * we try to help it. - * - * Single page allocs loop until the allocation succeeds. - * Multi-page allocs can fail due to memory fragmentation; - * in that case we bail out to prevent infinite loops and - * hanging device drivers ... - * - * Another issue are GFP_NOFS allocations; because they - * do not have __GFP_FS set it's possible we cannot make - * any progress freeing pages, in that case it's better - * to give up than to deadlock the kernel looping here. 
- */ - if (gfp_mask & __GFP_WAIT) { - if (!order || free_shortage()) { - int progress = try_to_free_pages(gfp_mask); - if (progress || (gfp_mask & __GFP_FS)) - goto try_again; - /* - * Fail in case no progress was made and the - * allocation may not be able to block on IO. - */ - return NULL; - } - } - } - - /* - * Final phase: allocate anything we can! - * - * Higher order allocations, GFP_ATOMIC allocations and - * recursive allocations (PF_MEMALLOC) end up here. - * - * Only recursive allocations can use the very last pages - * in the system, otherwise it would be just too easy to - * deadlock the system... - */ zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - struct page * page = NULL; - if (!z) - break; - if (!z->size) - BUG(); + if (__builtin_expect(freed, 1)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* - * SUBTLE: direct_reclaim is only possible if the task - * becomes PF_MEMALLOC while looping above. This will - * happen when the OOM killer selects this task for - * instant execution... - */ - if (direct_reclaim) { - page = reclaim_page(z); - if (page) - return page; + if (zone_free_pages(z, order) > (gfp_mask & __GFP_HIGH ? z->pages_min / 2 : z->pages_min)) { + page = rmqueue(z, order); + if (page) + return page; + } } + } else { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; - /* XXX: is pages_min/4 a good amount to reserve for this? */ - if (z->free_pages < z->pages_min / 4 && - !(current->flags & PF_MEMALLOC)) - continue; - page = rmqueue(z, order); - if (page) - return page; + if (zone_free_pages(z, order) > z->pages_high) { + page = rmqueue(z, order); + if (page) + return page; + } + } } - /* No luck.. */ - printk(KERN_ERR "__alloc_pages: %lu-order allocation failed (gfp=0x%x/%i).\n", - order, gfp_mask, !!(current->flags & PF_MEMALLOC)); + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i) from %p\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC), __builtin_return_address(0)); return NULL; } /* * Common helper functions. 
*/ -unsigned long __get_free_pages(int gfp_mask, unsigned long order) +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) { struct page * page; @@ -521,7 +417,7 @@ return (unsigned long) page_address(page); } -unsigned long get_zeroed_page(int gfp_mask) +unsigned long get_zeroed_page(unsigned int gfp_mask) { struct page * page; @@ -534,13 +430,13 @@ return 0; } -void __free_pages(struct page *page, unsigned long order) +void __free_pages(struct page *page, unsigned int order) { if (!PageReserved(page) && put_page_testzero(page)) __free_pages_ok(page, order); } -void free_pages(unsigned long addr, unsigned long order) +void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) __free_pages(virt_to_page(addr), order); @@ -613,24 +509,6 @@ } /* - * Total amount of inactive_clean (allocatable) RAM: - */ -unsigned int nr_inactive_clean_pages (void) -{ - unsigned int sum; - zone_t *zone; - pg_data_t *pgdat = pgdat_list; - - sum = 0; - while (pgdat) { - for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) - sum += zone->inactive_clean_pages; - pgdat = pgdat->node_next; - } - return sum; -} - -/* * Amount of free RAM allocatable as buffer memory: */ unsigned int nr_free_buffer_pages (void) @@ -645,12 +523,12 @@ zonep = zonelist->zones; for (zone = *zonep++; zone; zone = *zonep++) - sum += zone->free_pages + zone->inactive_clean_pages + zone->inactive_dirty_pages; + sum += zone->free_pages; pgdat = pgdat->node_next; } while (pgdat); - return sum; + return sum + nr_active_pages + nr_inactive_pages; } #if CONFIG_HIGHMEM @@ -674,21 +552,17 @@ */ void show_free_areas_core(pg_data_t *pgdat) { - unsigned long order; + unsigned int order; unsigned type; printk("Free pages: %6dkB (%6dkB HighMem)\n", nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n", - nr_active_pages, - nr_inactive_dirty_pages, - nr_inactive_clean_pages(), - nr_free_pages(), - freepages.min, - freepages.low, - freepages.high); + printk("( Active: %d, inactive: %d, free: %d )\n", + nr_active_pages, + nr_inactive_pages, + nr_free_pages()); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; @@ -808,8 +682,8 @@ printk("On node %d totalpages: %lu\n", nid, realtotalpages); - memlist_init(&active_list); - memlist_init(&inactive_dirty_list); + INIT_LIST_HEAD(&active_list); + INIT_LIST_HEAD(&inactive_list); /* * Some architectures (with lots of mem and discontinous memory @@ -828,6 +702,7 @@ pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; /* * Initially all pages are reserved - free ones are freed @@ -857,12 +732,11 @@ zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->inactive_clean_pages = 0; - zone->inactive_dirty_pages = 0; - memlist_init(&zone->inactive_clean_list); if (!size) continue; + pgdat->nr_zones = j+1; + mask = (realsize / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; @@ -871,20 +745,7 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - /* - * Add these free targets to the global free target; - * we have to be SURE that freepages.high is higher - * than SUM [zone->pages_min] for all zones, otherwise - * we may have bad bad problems. 
- * - * This means we cannot make the freepages array writable - * in /proc, but have to add a separate extra_free_target - * for people who require it to catch load spikes in eg. - * gigabit ethernet routing... - */ - freepages.min += mask; - freepages.low += mask*2; - freepages.high += mask*3; + zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; diff -urN vm-ref/mm/shmem.c vm/mm/shmem.c --- vm-ref/mm/shmem.c Mon Sep 17 00:15:00 2001 +++ vm/mm/shmem.c Mon Sep 17 01:26:25 2001 @@ -353,7 +353,7 @@ swap_free(*entry); *entry = (swp_entry_t) {0}; delete_from_swap_cache_nolock(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1); page->flags = flags | (1 << PG_dirty); add_to_page_cache_locked(page, mapping, idx); info->swapped--; diff -urN vm-ref/mm/slab.c vm/mm/slab.c --- vm-ref/mm/slab.c Mon Sep 17 01:26:14 2001 +++ vm/mm/slab.c Mon Sep 17 01:26:25 2001 @@ -1704,7 +1704,7 @@ * * Called from do_try_to_free_pages() and __alloc_pages() */ -void kmem_cache_reap (int gfp_mask) +int kmem_cache_reap (int gfp_mask) { slab_t *slabp; kmem_cache_t *searchp; @@ -1712,12 +1712,13 @@ unsigned int best_pages; unsigned int best_len; unsigned int scan; + int ret = 0; if (gfp_mask & __GFP_WAIT && !(current->flags & PF_ATOMICALLOC)) down(&cache_chain_sem); else if (down_trylock(&cache_chain_sem)) - return; + return 0; scan = REAP_SCANLEN; best_len = 0; @@ -1821,9 +1822,10 @@ spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); + ret = scan * (1 << best_cachep->gfporder); out: up(&cache_chain_sem); - return; + return ret; } #ifdef CONFIG_PROC_FS diff -urN vm-ref/mm/swap.c vm/mm/swap.c --- vm-ref/mm/swap.c Mon Sep 17 01:26:13 2001 +++ vm/mm/swap.c Mon Sep 17 01:26:25 2001 @@ -24,50 +24,13 @@ #include /* for copy_to/from_user */ #include -/* - * We identify three levels of free memory. We never let free mem - * fall below the freepages.min except for atomic allocations. We - * start background swapping if we fall below freepages.high free - * pages, and we begin intensive swapping below freepages.low. - * - * Actual initialization is done in mm/page_alloc.c - */ -freepages_t freepages = { - 0, /* freepages.min */ - 0, /* freepages.low */ - 0 /* freepages.high */ -}; - /* How many pages do we try to swap or page in/out together? */ int page_cluster; -/* - * This variable contains the amount of page steals the system - * is doing, averaged over a minute. We use this to determine how - * many inactive pages we should have. 
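/*
 * With the global freepages tunable gone (above), the per-zone watermarks
 * are the only free-memory targets left, and they come straight from the
 * zone size: mask = realsize / zone_balance_ratio, clamped to [20, 255]
 * pages, then pages_min/low/high = mask, 2*mask, 3*mask.  With 4 kB pages,
 * a 48 MB Normal zone has 12288 pages, 12288 / 128 = 96, giving 96/192/288
 * pages (384/768/1152 kB); any Normal zone past roughly 128 MB caps out at
 * 255/510/765.  As a standalone helper (hypothetical name, sketch only):
 */
static void set_zone_watermarks(zone_t *zone, unsigned long realsize,
                                unsigned long ratio,
                                unsigned long min_pages, unsigned long max_pages)
{
        unsigned long mask = realsize / ratio;

        if (mask < min_pages)
                mask = min_pages;
        else if (mask > max_pages)
                mask = max_pages;
        zone->pages_min = mask;
        zone->pages_low = mask * 2;
        zone->pages_high = mask * 3;
}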
- * - * In reclaim_page and __alloc_pages: memory_pressure++ - * In __free_pages_ok: memory_pressure-- - * In recalculate_vm_stats the value is decayed (once a second) - */ -int memory_pressure; - /* We track the number of pages currently being asynchronously swapped out, so that we don't try to swap TOO many pages out at once */ atomic_t nr_async_pages = ATOMIC_INIT(0); -buffer_mem_t buffer_mem = { - 2, /* minimum percent buffer */ - 10, /* borrow percent buffer */ - 60 /* maximum percent buffer */ -}; - -buffer_mem_t page_cache = { - 2, /* minimum percent page cache */ - 15, /* borrow percent page cache */ - 75 /* maximum */ -}; - pager_daemon_t pager_daemon = { 512, /* base number for calculating the number of tries */ SWAP_CLUSTER_MAX, /* minimum number of tries */ @@ -87,25 +50,9 @@ */ void deactivate_page_nolock(struct page * page) { - /* - * One for the cache, one for the extra reference the - * caller has and (maybe) one for the buffers. - * - * This isn't perfect, but works for just about everything. - * Besides, as long as we don't move unfreeable pages to the - * inactive_clean list it doesn't need to be perfect... - */ - int maxcount = (page->buffers ? 3 : 2); - page->age = 0; - ClearPageReferenced(page); - - /* - * Don't touch it if it's not on the active list. - * (some pages aren't on any list at all) - */ - if (PageActive(page) && page_count(page) <= maxcount) { + if (PageActive(page)) { del_page_from_active_list(page); - add_page_to_inactive_dirty_list(page); + add_page_to_inactive_list(page); } } @@ -121,22 +68,10 @@ */ void activate_page_nolock(struct page * page) { - if (PageInactiveDirty(page)) { - del_page_from_inactive_dirty_list(page); - add_page_to_active_list(page); - } else if (PageInactiveClean(page)) { - del_page_from_inactive_clean_list(page); + if (PageInactive(page)) { + del_page_from_inactive_list(page); add_page_to_active_list(page); - } else { - /* - * The page was not on any list, so we take care - * not to do anything. - */ } - - /* Make sure the page gets a fair chance at staying active. */ - if (page->age < PAGE_AGE_START) - page->age = PAGE_AGE_START; } void activate_page(struct page * page) @@ -152,11 +87,10 @@ */ void lru_cache_add(struct page * page) { - spin_lock(&pagemap_lru_lock); if (!PageLocked(page)) BUG(); - add_page_to_inactive_dirty_list(page); - page->age = 0; + spin_lock(&pagemap_lru_lock); + add_page_to_inactive_list(page); spin_unlock(&pagemap_lru_lock); } @@ -171,14 +105,11 @@ { if (PageActive(page)) { del_page_from_active_list(page); - } else if (PageInactiveDirty(page)) { - del_page_from_inactive_dirty_list(page); - } else if (PageInactiveClean(page)) { - del_page_from_inactive_clean_list(page); - } else { + } else if (PageInactive(page)) { + del_page_from_inactive_list(page); + } else printk("VM: __lru_cache_del, found unknown page ?!\n"); - } - DEBUG_ADD_PAGE + DEBUG_LRU_PAGE(page); } /** @@ -192,22 +123,6 @@ spin_lock(&pagemap_lru_lock); __lru_cache_del(page); spin_unlock(&pagemap_lru_lock); -} - -/** - * recalculate_vm_stats - recalculate VM statistics - * - * This function should be called once a second to recalculate - * some useful statistics the VM subsystem uses to determine - * its behaviour. - */ -void recalculate_vm_stats(void) -{ - /* - * Substract one second worth of memory_pressure from - * memory_pressure. 
- */ - memory_pressure -= (memory_pressure >> INACTIVE_SHIFT); } /* diff -urN vm-ref/mm/swap_state.c vm/mm/swap_state.c --- vm-ref/mm/swap_state.c Mon Sep 17 00:15:00 2001 +++ vm/mm/swap_state.c Mon Sep 17 01:26:25 2001 @@ -23,17 +23,6 @@ */ static int swap_writepage(struct page *page) { - /* One for the page cache, one for this user, one for page->buffers */ - if (page_count(page) > 2 + !!page->buffers) - goto in_use; - if (swap_count(page) > 1) - goto in_use; - - delete_from_swap_cache_nolock(page); - UnlockPage(page); - return 0; - -in_use: rw_swap_page(WRITE, page); return 0; } @@ -81,9 +70,8 @@ BUG(); /* clear PG_dirty so a subsequent set_page_dirty takes effect */ - flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_arch_1)); + flags = page->flags & ~(1 << PG_error | 1 << PG_dirty | 1 << PG_arch_1 | 1 << PG_referenced); page->flags = flags | (1 << PG_uptodate); - page->age = PAGE_AGE_START; add_to_page_cache_locked(page, &swapper_space, entry.val); } diff -urN vm-ref/mm/swapfile.c vm/mm/swapfile.c --- vm-ref/mm/swapfile.c Mon Sep 17 00:15:00 2001 +++ vm/mm/swapfile.c Mon Sep 17 01:26:25 2001 @@ -31,25 +31,6 @@ struct swap_info_struct swap_info[MAX_SWAPFILES]; -/* - * When swap space gets filled up, we will set this flag. - * This will make do_swap_page(), in the page fault path, - * free swap entries on swapin so we'll reclaim swap space - * in order to be able to swap something out. - * - * At the moment we start reclaiming when swap usage goes - * over 80% of swap space. - * - * XXX: Random numbers, fixme. - */ -#define SWAP_FULL_PCT 80 -int vm_swap_full (void) -{ - int swap_used = total_swap_pages - nr_swap_pages; - - return swap_used * 100 > total_swap_pages * SWAP_FULL_PCT; -} - #define SWAPFILE_CLUSTER 256 static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count) @@ -471,7 +452,6 @@ lock_page(page); if (PageSwapCache(page)) delete_from_swap_cache_nolock(page); - SetPageDirty(page); UnlockPage(page); flush_page_to_ram(page); @@ -512,6 +492,7 @@ mmput(start_mm); start_mm = new_start_mm; } + ClearPageDirty(page); page_cache_release(page); /* diff -urN vm-ref/mm/vmscan.c vm/mm/vmscan.c --- vm-ref/mm/vmscan.c Mon Sep 17 01:26:13 2001 +++ vm/mm/vmscan.c Mon Sep 17 01:26:25 2001 @@ -32,19 +32,6 @@ */ #define DEF_PRIORITY (6) -static inline void age_page_up(struct page *page) -{ - unsigned age = page->age + PAGE_AGE_ADV; - if (age > PAGE_AGE_MAX) - age = PAGE_AGE_MAX; - page->age = age; -} - -static inline void age_page_down(struct page * page) -{ - page->age /= 2; -} - /* * The swap-out function returns 1 if it successfully * scanned all the pages it was asked to (`count'). @@ -54,55 +41,24 @@ * doesn't count as having freed a page. */ -/* - * Estimate whether a zone has enough inactive or free pages.. - */ -static unsigned int zone_inactive_plenty(zone_t *zone) -{ - unsigned int inactive; - - if (!zone->size) - return 0; - - inactive = zone->inactive_dirty_pages; - inactive += zone->inactive_clean_pages; - inactive += zone->free_pages; - - return (inactive > (zone->size / 3)); -} - -static unsigned int zone_free_plenty(zone_t *zone) -{ - unsigned int free; - - free = zone->free_pages; - free += zone->inactive_clean_pages; - - return free > zone->pages_high*2; -} - /* mm->page_table_lock is held. 
mmap_sem is not held */ -static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page) +static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone) { pte_t pte; swp_entry_t entry; - /* - * If we are doing a zone-specific scan, do not - * touch pages from zones which don't have a - * shortage. - */ - if (zone_inactive_plenty(page->zone)) - return; - /* Don't look at this pte if it's been accessed recently. */ if (ptep_test_and_clear_young(page_table)) { - mark_page_accessed(page); - return; + flush_tlb_page(vma, address); + SetPageReferenced(page); + return 0; } + if (!memclass(page->zone, classzone)) + return 0; + if (TryLockPage(page)) - return; + return 0; /* From this point on, the odds are that we're going to * nuke this pte, so read and clear the pte. This hook @@ -127,11 +83,14 @@ set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: mm->rss--; - if (!PageReferenced(page)) - deactivate_page(page); UnlockPage(page); - page_cache_release(page); - return; + { + int freeable = page_count(page) - !!page->buffers <= 2; + if (freeable) + deactivate_page(page); + page_cache_release(page); + return freeable; + } } /* @@ -178,11 +137,11 @@ out_unlock_restore: set_pte(page_table, pte); UnlockPage(page); - return; + return 0; } /* mm->page_table_lock is held. mmap_sem is not held */ -static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count) +static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) { pte_t * pte; unsigned long pmd_end; @@ -206,20 +165,22 @@ struct page *page = pte_page(*pte); if (VALID_PAGE(page) && !PageReserved(page)) { - try_to_swap_out(mm, vma, address, pte, page); - if (!--count) + count -= try_to_swap_out(mm, vma, address, pte, page, classzone); + if (!count) { + address += PAGE_SIZE; break; + } } } address += PAGE_SIZE; pte++; } while (address && (address < end)); - mm->swap_address = address + PAGE_SIZE; + mm->swap_address = address; return count; } /* mm->page_table_lock is held. mmap_sem is not held */ -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count) +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone) { pmd_t * pmd; unsigned long pgd_end; @@ -239,7 +200,7 @@ end = pgd_end; do { - count = swap_out_pmd(mm, vma, pmd, address, end, count); + count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; address = (address + PMD_SIZE) & PMD_MASK; @@ -249,7 +210,7 @@ } /* mm->page_table_lock is held. 
mmap_sem is not held */ -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count) +static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone) { pgd_t *pgdir; unsigned long end; @@ -264,7 +225,7 @@ if (address >= end) BUG(); do { - count = swap_out_pgd(mm, vma, pgdir, address, end, count); + count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; address = (address + PGDIR_SIZE) & PGDIR_MASK; @@ -273,25 +234,26 @@ return count; } +/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ +struct mm_struct *swap_mm = &init_mm; + /* * Returns non-zero if we scanned all `count' pages */ -static int swap_out_mm(struct mm_struct * mm, int count) +static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone) { unsigned long address; struct vm_area_struct* vma; - if (!count) - return 1; - /* - * Go through process' page directory. - */ - /* * Find the proper vm-area after freezing the vma chain * and ptes. */ spin_lock(&mm->page_table_lock); + *race = 1; + if (swap_mm != mm) + goto out_unlock; + *race = 0; address = mm->swap_address; vma = find_vma(mm, address); if (vma) { @@ -299,7 +261,7 @@ address = vma->vm_start; for (;;) { - count = swap_out_vma(mm, vma, address, count); + count = swap_out_vma(mm, vma, address, count, classzone); if (!count) goto out_unlock; vma = vma->vm_next; @@ -311,224 +273,106 @@ /* Reset to 0 when we reach the end of address space */ mm->swap_address = 0; + spin_lock(&mmlist_lock); + swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); + spin_unlock(&mmlist_lock); + out_unlock: spin_unlock(&mm->page_table_lock); - return !count; -} - -#define SWAP_MM_SHIFT 4 -#define SWAP_SHIFT 5 -#define SWAP_MIN 8 -static inline int swap_amount(struct mm_struct *mm) -{ - int nr = mm->rss >> SWAP_SHIFT; - if (nr < SWAP_MIN) { - nr = SWAP_MIN; - if (nr > mm->rss) - nr = mm->rss; - } - return nr; + return count; } -/* Placeholder for swap_out(): may be updated by fork.c:mmput() */ -struct mm_struct *swap_mm = &init_mm; - -static void swap_out(unsigned int priority, int gfp_mask) +static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)); +static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - int counter; - int retval = 0; - struct mm_struct *mm = current->mm; - - /* Always start by trying to penalize the process that is allocating memory */ - if (mm) - retval = swap_out_mm(mm, swap_amount(mm)); + int counter, race; + struct mm_struct *mm; /* Then, look at the other mm's */ - counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority; + counter = mmlist_nr / priority; do { + if (current->need_resched) + schedule(); + spin_lock(&mmlist_lock); mm = swap_mm; if (mm == &init_mm) { mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); if (mm == &init_mm) goto empty; + swap_mm = mm; } - /* Set pointer for next call to next in the list */ - swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); /* Make sure the mm doesn't disappear when we drop the lock.. 
*/ atomic_inc(&mm->mm_users); spin_unlock(&mmlist_lock); - /* Walk about 6% of the address space each time */ - retval |= swap_out_mm(mm, swap_amount(mm)); + nr_pages = swap_out_mm(mm, nr_pages, &race, classzone); + mmput(mm); - } while (--counter >= 0); - return; + + if (!nr_pages) + return 1; + } while (race || --counter >= 0); + + return 0; empty: spin_unlock(&mmlist_lock); + return 0; } - -/** - * reclaim_page - reclaims one page from the inactive_clean list - * @zone: reclaim a page from this zone - * - * The pages on the inactive_clean can be instantly reclaimed. - * The tests look impressive, but most of the time we'll grab - * the first page of the list and exit successfully. - */ -struct page * reclaim_page(zone_t * zone) +static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)); +static int shrink_cache(struct list_head * lru, int * max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask) { - struct page * page = NULL; - struct list_head * page_lru; - int maxscan; + LIST_HEAD(active_local_lru); + LIST_HEAD(inactive_local_lru); + struct list_head * entry; + int __max_scan = *max_scan; - /* - * We only need the pagemap_lru_lock if we don't reclaim the page, - * but we have to grab the pagecache_lock before the pagemap_lru_lock - * to avoid deadlocks and most of the time we'll succeed anyway. - */ - spin_lock(&pagecache_lock); spin_lock(&pagemap_lru_lock); - maxscan = zone->inactive_clean_pages; - while ((page_lru = zone->inactive_clean_list.prev) != - &zone->inactive_clean_list && maxscan--) { - page = list_entry(page_lru, struct page, lru); + while (__max_scan && (entry = lru->prev) != lru) { + struct page * page; - /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageInactiveClean(page)) { - printk("VM: reclaim_page, wrong page on list.\n"); - list_del(page_lru); - page->zone->inactive_clean_pages--; + if (__builtin_expect(current->need_resched, 0)) { + spin_unlock(&pagemap_lru_lock); + schedule(); + spin_lock(&pagemap_lru_lock); continue; } - /* Page is referenced? Clear and move to the head of the list.. */ - if (PageTestandClearReferenced(page)) { - list_del(page_lru); - list_add(page_lru, &zone->inactive_clean_list); - } + page = list_entry(entry, struct page, lru); - /* The page is dirty, or locked, move to inactive_dirty list. */ - if (page->buffers || PageDirty(page) || TryLockPage(page)) { - del_page_from_inactive_clean_list(page); - add_page_to_inactive_dirty_list(page); - continue; - } + if (__builtin_expect(!PageInactive(page) && !PageActive(page), 0)) + BUG(); - /* Page is in use? Move it to the active list. */ - if (page_count(page) > 1) { - UnlockPage(page); - del_page_from_inactive_clean_list(page); - add_page_to_active_list(page); + if (PageTestandClearReferenced(page)) { + if (PageInactive(page)) { + del_page_from_inactive_list(page); + add_page_to_active_list(page); + } else if (PageActive(page)) { + list_del(entry); + list_add(entry, &active_list); + } else + BUG(); continue; } - /* OK, remove the page from the caches. */ - if (PageSwapCache(page)) { - __delete_from_swap_cache(page); - goto found_page; - } - - if (page->mapping) { - __remove_inode_page(page); - goto found_page; - } - - /* We should never ever get here. */ - printk(KERN_ERR "VM: reclaim_page, found unknown page\n"); - list_del(page_lru); - zone->inactive_clean_pages--; - UnlockPage(page); - } - /* Reset page pointer, maybe we encountered an unfreeable page. 
*/ - page = NULL; - goto out; - -found_page: - memory_pressure++; - del_page_from_inactive_clean_list(page); - UnlockPage(page); - page->age = PAGE_AGE_START; - if (page_count(page) != 1) - printk("VM: reclaim_page, found page with count %d!\n", - page_count(page)); -out: - spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); - return page; -} - -/** - * page_launder - clean dirty inactive pages, move to inactive_clean list - * @gfp_mask: what operations we are allowed to do - * @sync: are we allowed to do synchronous IO in emergencies ? - * - * When this function is called, we are most likely low on free + - * inactive_clean pages. Since we want to refill those pages as - * soon as possible, we'll make two loops over the inactive list, - * one to move the already cleaned pages to the inactive_clean lists - * and one to (often asynchronously) clean the dirty inactive pages. - * - * In situations where kswapd cannot keep up, user processes will - * end up calling this function. Since the user process needs to - * have a page before it can continue with its allocation, we'll - * do synchronous page flushing in that case. - * - * This code used to be heavily inspired by the FreeBSD source code. - * Thanks go out to Matthew Dillon. - */ -#define CAN_DO_FS (gfp_mask & __GFP_FS) -int page_launder(int gfp_mask, int sync) -{ - int maxscan, cleaned_pages; - struct list_head * page_lru; - struct page * page; - - cleaned_pages = 0; - - /* Will we wait on IO? */ - if (!sync) - gfp_mask &= ~__GFP_WAIT; - - spin_lock(&pagemap_lru_lock); - maxscan = nr_inactive_dirty_pages >> DEF_PRIORITY; - while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list && - maxscan-- > 0) { - page = list_entry(page_lru, struct page, lru); + deactivate_page_nolock(page); + list_del(entry); + list_add_tail(entry, &inactive_local_lru); - /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageInactiveDirty(page)) { - printk("VM: page_launder, wrong page on list.\n"); - list_del(page_lru); - nr_inactive_dirty_pages--; - page->zone->inactive_dirty_pages--; + if (__builtin_expect(!memclass(page->zone, classzone), 0)) continue; - } - /* Page is referenced? Clear and move to the head of the list.. */ - if (PageTestandClearReferenced(page)) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); - } - - /* Page is in use? Move it to the active list. */ - if ((!page->buffers && page_count(page) > 1)) { - del_page_from_inactive_dirty_list(page); - add_page_to_active_list(page); - continue; - } + __max_scan--; - /* - * If this zone has plenty of pages free, - * don't spend time on cleaning it. - */ - if (zone_free_plenty(page->zone)) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); + /* Racy check to avoid trylocking when not worthwhile */ + if (!page->buffers && page_count(page) != 1) { + activate_page_nolock(page); + list_del(entry); + list_add_tail(entry, &active_local_lru); continue; } @@ -536,362 +380,252 @@ * The page is locked. IO in progress? * Move it to the back of the list. */ - if (TryLockPage(page)) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); + if (__builtin_expect(TryLockPage(page), 0)) continue; - } - /* - * Dirty swap-cache page? Write it out if - * last copy.. 
- */ - if (PageDirty(page)) { + if (PageDirty(page) && is_page_cache_freeable(page)) { + /* + * It is not critical here to write it only if + * the page is unmapped beause any direct writer + * like O_DIRECT would set the PG_dirty bitflag + * on the phisical page after having successfully + * pinned it and after the I/O to the page is finished, + * so the direct writes to the page cannot get lost. + */ int (*writepage)(struct page *); - /* Can a page get here without page->mapping? */ - if (!page->mapping) - goto page_active; writepage = page->mapping->a_ops->writepage; - if (!writepage) - goto page_active; + if (gfp_mask & __GFP_FS && writepage) { + spin_unlock(&pagemap_lru_lock); - /* Can't do it? Move it to the back of the list */ - if (!CAN_DO_FS) { - list_del(page_lru); - list_add(page_lru, &inactive_dirty_list); - UnlockPage(page); + ClearPageDirty(page); + writepage(page); + + spin_lock(&pagemap_lru_lock); continue; } - - /* OK, do a physical asynchronous write to swap. */ - ClearPageDirty(page); - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - - writepage(page); - page_cache_release(page); - - /* And re-start the thing.. */ - spin_lock(&pagemap_lru_lock); - continue; } /* * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we either free - * the page (in case it was a buffercache only page) or we - * move the page to the inactive_clean list. - * - * On the first round, we should free all previously cleaned - * buffer pages + * associated with this page. If we succeed we try to free + * the page as well. */ if (page->buffers) { - int clearedbuf; - int freed_page = 0; - - /* - * Since we might be doing disk IO, we have to - * drop the spinlock and take an extra reference - * on the page so it doesn't go away from under us. - */ - del_page_from_inactive_dirty_list(page); - page_cache_get(page); spin_unlock(&pagemap_lru_lock); - /* Try to free the page buffers. */ - clearedbuf = try_to_free_buffers(page, gfp_mask); + /* avoid to free a locked page */ + page_cache_get(page); - /* - * Re-take the spinlock. Note that we cannot - * unlock the page yet since we're still - * accessing the page_struct here... - */ - spin_lock(&pagemap_lru_lock); + if (try_to_free_buffers(page, gfp_mask)) { + if (!page->mapping) { + UnlockPage(page); + + /* + * Account we successfully freed a page + * of buffer cache. + */ + atomic_dec(&buffermem_pages); - /* The buffers were not freed. */ - if (!clearedbuf) { - add_page_to_inactive_dirty_list(page); - - /* The page was only in the buffer cache. */ - } else if (!page->mapping) { - atomic_dec(&buffermem_pages); - freed_page = 1; - cleaned_pages++; + spin_lock(&pagemap_lru_lock); + __lru_cache_del(page); - /* The page has more users besides the cache and us. */ - } else if (page_count(page) > 2) { - add_page_to_active_list(page); + /* effectively free the page here */ + page_cache_release(page); - /* OK, we "created" a freeable page. */ - } else /* page->mapping && page_count(page) == 2 */ { - add_page_to_inactive_clean_list(page); - cleaned_pages++; - } + if (--nr_pages) + continue; + break; + } else { + /* + * The page is still in pagecache so undo the stuff + * before the try_to_free_buffers since we've not + * finished and we can now try the next step. + */ + page_cache_release(page); - /* - * Unlock the page and drop the extra reference. - * We can only do it here because we are accessing - * the page struct above. 
- */ - UnlockPage(page); - page_cache_release(page); + spin_lock(&pagemap_lru_lock); + } + } else { + /* failed to drop the buffers so stop here */ + UnlockPage(page); + page_cache_release(page); - continue; - } else if (page->mapping && !PageDirty(page)) { - /* - * If a page had an extra reference in - * deactivate_page(), we will find it here. - * Now the page is really freeable, so we - * move it to the inactive_clean list. - */ - del_page_from_inactive_dirty_list(page); - add_page_to_inactive_clean_list(page); - UnlockPage(page); - cleaned_pages++; - } else { -page_active: - /* - * OK, we don't know what to do with the page. - * It's no use keeping it here, so we move it to - * the active list. - */ - del_page_from_inactive_dirty_list(page); - add_page_to_active_list(page); - UnlockPage(page); + spin_lock(&pagemap_lru_lock); + continue; + } } - } - spin_unlock(&pagemap_lru_lock); - - /* Return the number of pages moved to the inactive_clean list. */ - return cleaned_pages; -} -/** - * refill_inactive_scan - scan the active list and find pages to deactivate - * @priority: the priority at which to scan - * - * This function will scan a portion of the active list to find - * unused pages, those pages will then be moved to the inactive list. - */ -static int refill_inactive_scan(unsigned int priority) -{ - struct list_head * page_lru; - struct page * page; - int maxscan = nr_active_pages >> priority; - int page_active = 0; - int nr_deactivated = 0; + if (__builtin_expect(!page->mapping, 0)) + BUG(); - /* Take the lock while messing with the list... */ - spin_lock(&pagemap_lru_lock); - while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) { - page = list_entry(page_lru, struct page, lru); + if (__builtin_expect(!spin_trylock(&pagecache_lock), 0)) { + /* we hold the page lock so the page cannot go away from under us */ + spin_unlock(&pagemap_lru_lock); - /* Wrong page on list?! (list corruption, should not happen) */ - if (!PageActive(page)) { - printk("VM: refill_inactive, wrong page on list.\n"); - list_del(page_lru); - nr_active_pages--; - continue; + spin_lock(&pagecache_lock); + spin_lock(&pagemap_lru_lock); } /* - * Do not deactivate pages from zones which - * have plenty inactive pages. + * this is the non-racy check, it is critical to check + * PageDirty _after_ we made sure the page is freeable + * so not in use by anybody. */ - - if (zone_inactive_plenty(page->zone)) { - page_active = 1; - goto skip_page; + if (!is_page_cache_freeable(page) || PageDirty(page)) { + spin_unlock(&pagecache_lock); + UnlockPage(page); + continue; } - /* Do aging on the pages. */ - if (PageTestandClearReferenced(page)) { - age_page_up(page); - page_active = 1; - } else { - age_page_down(page); - /* - * Since we don't hold a reference on the page - * ourselves, we have to do our test a bit more - * strict then deactivate_page(). This is needed - * since otherwise the system could hang shuffling - * unfreeable pages from the active list to the - * inactive_dirty list and back again... - * - * SUBTLE: we can have buffer pages with count 1. - */ - if (page_count(page) <= (page->buffers ? 2 : 1)) { - deactivate_page_nolock(page); - page_active = 0; - } else { - page_active = 1; - } - } - /* - * If the page is still on the active list, move it - * to the other end of the list. Otherwise we exit if - * we have done enough work. 
- */ - if (page_active || PageActive(page)) { -skip_page: - list_del(page_lru); - list_add(page_lru, &active_list); - } else { - nr_deactivated++; - } + /* point of no return */ + if (__builtin_expect(!PageSwapCache(page), 1)) + __remove_inode_page(page); + else + __delete_from_swap_cache(page); + spin_unlock(&pagecache_lock); + + __lru_cache_del(page); + + UnlockPage(page); + + /* effectively free the page here */ + page_cache_release(page); + + if (--nr_pages) + continue; + break; } + + list_splice(&inactive_local_lru, &inactive_list); + list_splice(&active_local_lru, &active_list); spin_unlock(&pagemap_lru_lock); - return nr_deactivated; + *max_scan = __max_scan; + return nr_pages; } -/* - * Check if there are zones with a severe shortage of free pages, - * or if all zones have a minor shortage. - */ -int free_shortage(void) +static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)); +static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages) { - pg_data_t *pgdat; - unsigned int global_free = 0; - unsigned int global_target = freepages.high; - - /* Are we low on free pages anywhere? */ - pgdat = pgdat_list; - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - unsigned int free; + int max_scan = (nr_inactive_pages + nr_active_pages / priority) / priority; - if (!zone->size) - continue; + nr_pages -= kmem_cache_reap(gfp_mask); + if (nr_pages <= 0) + return 0; - free = zone->free_pages; - free += zone->inactive_clean_pages; + nr_pages = shrink_cache(&inactive_list, &max_scan, nr_pages, classzone, gfp_mask); + if (nr_pages <= 0) + return 0; - /* Local shortage? */ - if (free < zone->pages_low) - return 1; + shrink_dcache_memory(priority, gfp_mask); + shrink_icache_memory(priority, gfp_mask); - global_free += free; - } - pgdat = pgdat->node_next; - } while (pgdat); + nr_pages = shrink_cache(&active_list, &max_scan, nr_pages, classzone, gfp_mask); + if (nr_pages <= 0) + return 0; - /* Global shortage? */ - return global_free < global_target; + return nr_pages; } -/* - * Are we low on inactive pages globally or in any zone? - */ -int inactive_shortage(void) +int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order) { - pg_data_t *pgdat; - unsigned int global_target = freepages.high + inactive_target; - unsigned int global_inactive = 0; + int priority = DEF_PRIORITY; - pgdat = pgdat_list; do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones + i; - unsigned int inactive; + int nr_pages = SWAP_CLUSTER_MAX; + nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages); + if (nr_pages <= 0) + return 1; - if (!zone->size) - continue; + swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX); + } while (--priority); - inactive = zone->inactive_dirty_pages; - inactive += zone->inactive_clean_pages; - inactive += zone->free_pages; + return 0; +} - /* Local shortage? */ - if (inactive < zone->pages_high) - return 1; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - global_inactive += inactive; - } - pgdat = pgdat->node_next; - } while (pgdat); +static int check_classzone_need_balance(zone_t * classzone) +{ + zone_t * first_classzone; - /* Global shortage? 
*/ - return global_inactive < global_target; + first_classzone = classzone->zone_pgdat->node_zones; + while (classzone >= first_classzone) { + if (classzone->free_pages > classzone->pages_high) + return 0; + classzone--; + } + return 1; } -/* - * Loop until we are no longer under an inactive or free - * shortage. Return 1 on success, 0 if we failed to get - * there even after "maxtry" loops. - */ -#define INACTIVE_SHORTAGE 1 -#define FREE_SHORTAGE 2 -#define GENERAL_SHORTAGE 4 -static int do_try_to_free_pages(unsigned int gfp_mask, int user) +static int kswapd_balance_pgdat(pg_data_t * pgdat) { - int shortage = 0; - int maxtry; + int need_more_balance = 0, i; + zone_t * zone; - /* Always walk at least the active queue when called */ - refill_inactive_scan(DEF_PRIORITY); + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (current->need_resched) + schedule(); + if (!zone->need_balance) + continue; + if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { + zone->need_balance = 0; + continue; + } + if (check_classzone_need_balance(zone)) + need_more_balance = 1; + else + zone->need_balance = 0; + } - maxtry = 1 << DEF_PRIORITY; - do { - /* - * If needed, we move pages from the active list - * to the inactive list. - */ - if (shortage & INACTIVE_SHORTAGE) { - /* Walk the VM space for a bit.. */ - swap_out(DEF_PRIORITY, gfp_mask); + return need_more_balance; +} - /* ..and refill the inactive list */ - refill_inactive_scan(DEF_PRIORITY); - } +static void kswapd_balance(void) +{ + int need_more_balance; + pg_data_t * pgdat; - /* - * If we're low on free pages, move pages from the - * inactive_dirty list to the inactive_clean list. - * - * Usually bdflush will have pre-cleaned the pages - * before we get around to moving them to the other - * list, so this is a relatively cheap operation. - */ - if (shortage & FREE_SHORTAGE) - page_launder(gfp_mask, user); + do { + need_more_balance = 0; + pgdat = pgdat_list; + do + need_more_balance |= kswapd_balance_pgdat(pgdat); + while ((pgdat = pgdat->node_next)); + } while (need_more_balance); +} - /* - * Reclaim unused slab cache if we were short on memory. - */ - if (shortage & GENERAL_SHORTAGE) { - shrink_dcache_memory(DEF_PRIORITY, gfp_mask); - shrink_icache_memory(DEF_PRIORITY, gfp_mask); +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; - kmem_cache_reap(gfp_mask); - } + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->need_balance) + continue; + return 0; + } - if (current->need_resched) { - __set_current_state(TASK_RUNNING); - schedule(); - } + return 1; +} - shortage = 0; - if (inactive_shortage()) - shortage |= INACTIVE_SHORTAGE | GENERAL_SHORTAGE; - if (free_shortage()) - shortage |= FREE_SHORTAGE | GENERAL_SHORTAGE; +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; - if (--maxtry <= 0) - break; - } while (shortage); + pgdat = pgdat_list; + do { + if (kswapd_can_sleep_pgdat(pgdat)) + continue; + return 0; + } while ((pgdat = pgdat->node_next)); - /* Return success if we're not "totally short" */ - return shortage != (FREE_SHORTAGE | INACTIVE_SHORTAGE | GENERAL_SHORTAGE); + return 1; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); -DECLARE_WAIT_QUEUE_HEAD(kswapd_done); - /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -908,6 +642,7 @@ int kswapd(void *unused) { struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); daemonize(); strcpy(tsk->comm, "kswapd"); @@ -931,107 +666,31 @@ * Kswapd main loop. 
 	 */
 	for (;;) {
-		static long recalc = 0;
-
-		/* Once a second ... */
-		if (time_after(jiffies, recalc + HZ)) {
-			recalc = jiffies;
-
-			/* Recalculate VM statistics. */
-			recalculate_vm_stats();
-		}
-
-		if (!do_try_to_free_pages(GFP_KSWAPD, 1)) {
-			if (out_of_memory())
-				oom_kill();
-			continue;
-		}
-
-		run_task_queue(&tq_disk);
-		interruptible_sleep_on_timeout(&kswapd_wait, HZ);
-	}
-}
-
-void wakeup_kswapd(void)
-{
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
-}
-
-/*
- * Called by non-kswapd processes when they want more
- * memory but are unable to sleep on kswapd because
- * they might be holding some IO locks ...
- */
-int try_to_free_pages(unsigned int gfp_mask)
-{
-	int ret = 1;
-
-	if (gfp_mask & __GFP_WAIT) {
-		current->flags |= PF_MEMALLOC;
-		ret = do_try_to_free_pages(gfp_mask, 1);
-		current->flags &= ~PF_MEMALLOC;
-	}
-
-	return ret;
-}
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kswapd_wait, &wait);
 
-DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
-/*
- * Kreclaimd will move pages from the inactive_clean list to the
- * free list, in order to keep atomic allocations possible under
- * all circumstances.
- */
-int kreclaimd(void *unused)
-{
-	struct task_struct *tsk = current;
-	pg_data_t *pgdat;
-
-	daemonize();
-	strcpy(tsk->comm, "kreclaimd");
-	sigfillset(&tsk->blocked);
-	current->flags |= PF_MEMALLOC;
-
-	while (1) {
+		mb();
+		if (kswapd_can_sleep())
+			schedule();
 
-		/*
-		 * We sleep until someone wakes us up from
-		 * page_alloc.c::__alloc_pages().
-		 */
-		interruptible_sleep_on(&kreclaimd_wait);
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kswapd_wait, &wait);
 
 		/*
-		 * Move some pages from the inactive_clean lists to
-		 * the free lists, if it is needed.
+		 * If we actually get into a low-memory situation,
+		 * the processes needing more memory will wake us
+		 * up on a more timely basis.
 		 */
-		pgdat = pgdat_list;
-		do {
-			int i;
-			for(i = 0; i < MAX_NR_ZONES; i++) {
-				zone_t *zone = pgdat->node_zones + i;
-				if (!zone->size)
-					continue;
-
-				while (zone->free_pages < zone->pages_low) {
-					struct page * page;
-					page = reclaim_page(zone);
-					if (!page)
-						break;
-					__free_page(page);
-				}
-			}
-			pgdat = pgdat->node_next;
-		} while (pgdat);
+		kswapd_balance();
+		run_task_queue(&tq_disk);
 	}
 }
-
 static int __init kswapd_init(void)
 {
-	printk("Starting kswapd v1.8\n");
+	printk("Starting kswapd\n");
 	swap_setup();
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
-	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 	return 0;
 }
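
The vmscan.c changes above replace the old free/inactive-shortage heuristics with a single escalation loop: try_to_free_pages() starts at DEF_PRIORITY and, on each pass, hands shrink_caches() a scan budget derived from (nr_inactive_pages + nr_active_pages / priority) / priority, retrying at a more aggressive (numerically smaller) priority until SWAP_CLUSTER_MAX pages have been freed or priority reaches zero. The sketch below is a minimal userspace model of that control flow only, under stated assumptions: reclaim_some() and its 1-in-8 success rate are invented placeholders for the real shrink_cache()/swap_out() work, and the page counters are arbitrary numbers, not anything from the patch.

/*
 * Standalone model of the priority-escalation loop used by the patched
 * try_to_free_pages()/shrink_caches().  This is NOT kernel code: the
 * actual reclaim work is stubbed out so the control flow can be
 * compiled and run in userspace.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEF_PRIORITY      6	/* mildest scan, as in the patch */
#define SWAP_CLUSTER_MAX  32	/* pages we try to free per call */

static int nr_inactive_pages = 3000;	/* arbitrary stand-in counters */
static int nr_active_pages   = 9000;

/* Stub: pretend each unit of scan effort frees a page 1 time in 8. */
static int reclaim_some(int max_scan, int nr_pages)
{
	int freed = 0;

	while (max_scan-- && freed < nr_pages) {
		if (rand() % 8 == 0)
			freed++;
	}
	return nr_pages - freed;	/* pages still wanted */
}

/* Same shape as the patched shrink_caches(): budget scales with 1/priority. */
static int shrink_caches(int priority, int nr_pages)
{
	int max_scan = (nr_inactive_pages + nr_active_pages / priority) / priority;

	return reclaim_some(max_scan, nr_pages);
}

/* Same shape as the patched try_to_free_pages(). */
static int try_to_free_pages(void)
{
	int priority = DEF_PRIORITY;

	do {
		int nr_pages = shrink_caches(priority, SWAP_CLUSTER_MAX);

		if (nr_pages <= 0)
			return 1;	/* freed enough at this priority */
		/* the real code would call swap_out() here before retrying */
	} while (--priority);

	return 0;			/* gave up: out-of-memory path */
}

int main(void)
{
	printf("try_to_free_pages() -> %d\n", try_to_free_pages());
	return 0;
}

Built with a stock C compiler (e.g. cc -o reclaim_model reclaim_model.c), this only demonstrates the do { ... } while (--priority) shape and the way the scan budget grows as the priority number shrinks; none of the numbers are meaningful.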