diff -urN 2.4.0-test1-ac7/arch/i386/mm/init.c 2.4.0-test1-ac7-VM-31/arch/i386/mm/init.c --- 2.4.0-test1-ac7/arch/i386/mm/init.c Fri May 26 22:46:46 2000 +++ 2.4.0-test1-ac7-VM-31/arch/i386/mm/init.c Sat Jun 3 14:53:14 2000 @@ -608,7 +608,7 @@ initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), + nr_free_pages() << (PAGE_SHIFT-10), max_mapnr << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), diff -urN 2.4.0-test1-ac7/fs/buffer.c 2.4.0-test1-ac7-VM-31/fs/buffer.c --- 2.4.0-test1-ac7/fs/buffer.c Sat Jun 3 14:52:33 2000 +++ 2.4.0-test1-ac7-VM-31/fs/buffer.c Sat Jun 3 15:17:06 2000 @@ -495,17 +495,6 @@ __remove_from_lru_list(bh, bh->b_list); } -static void insert_into_queues(struct buffer_head *bh) -{ - struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); - - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); - __hash_link(bh, head); - __insert_into_lru_list(bh, bh->b_list); - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); -} /* This function must only run if there are no other * references _anywhere_ to this buffer head. @@ -537,12 +526,11 @@ * will force it bad). This shouldn't really happen currently, but * the code is ready. */ -struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +static struct buffer_head * __get_hash_table(kdev_t dev, int block, int size, + struct buffer_head **head) { - struct buffer_head **head = &hash(dev, block); struct buffer_head *bh; - read_lock(&hash_table_lock); for(bh = *head; bh; bh = bh->b_next) if (bh->b_blocknr == block && bh->b_size == size && @@ -550,11 +538,45 @@ break; if (bh) atomic_inc(&bh->b_count); + + return bh; +} + +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) +{ + struct buffer_head **head = &hash(dev, block); + struct buffer_head *bh; + + read_lock(&hash_table_lock); + bh = __get_hash_table(dev, block, size, head); read_unlock(&hash_table_lock); return bh; } +static int insert_into_queues_unique(struct buffer_head *bh) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head *alias; + int err = 0; + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + + alias = __get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size, head); + err = 1; + if (!alias) { + __hash_link(bh, head); + __insert_into_lru_list(bh, bh->b_list); + err = 0; + } + + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + + return err; +} + unsigned int get_hardblocksize(kdev_t dev) { /* @@ -713,6 +735,7 @@ static void refill_freelist(int size) { if (!grow_buffers(size)) { + current->state = TASK_RUNNING; wakeup_bdflush(1); current->policy |= SCHED_YIELD; schedule(); @@ -841,8 +864,16 @@ bh->b_blocknr = block; bh->b_state = 1 << BH_Mapped; - /* Insert the buffer into the regular lists */ - insert_into_queues(bh); + /* Insert the buffer into the regular lists; check noone + else added it first */ + + if (!insert_into_queues_unique(bh)) + goto out; + + /* someone added it after we last check the hash table */ + put_last_free(bh); + goto repeat; + out: touch_buffer(bh); return bh; @@ -858,7 +889,7 @@ /* -1 -> no need to flush 0 -> async flush - 1 -> sync flush (wait for I/O completation) */ + 1 -> sync flush (wait for I/O completion) */ static int balance_dirty_state(kdev_t dev) { unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; @@ -1235,7 +1266,7 @@ goto 
try_again; } -static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) +static void create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) { struct buffer_head *head, *bh, *tail; int block; @@ -1247,6 +1278,7 @@ * They don't show up in the buffer hash table, but they *are* * registered in page->buffers. */ + /* FIXME: create_buffers should fail if there's no enough memory */ head = create_buffers(page, size, 1); if (page->buffers) BUG(); @@ -1266,7 +1298,7 @@ tail->b_this_page = head; page_cache_get(page); page->buffers = head; - return 0; + lru_cache_buf(page, LRU_SWAP_CACHE); } static void unmap_buffer(struct buffer_head * bh) @@ -1289,44 +1321,33 @@ * any IO, we are not interested in the contents of the buffer. This * function can block if the buffer is locked. */ -static struct buffer_head *discard_buffer(struct buffer_head * bh) +static inline struct buffer_head *discard_buffer(struct buffer_head * bh) { - int index = BUFSIZE_INDEX(bh->b_size); struct buffer_head *next; - /* grab the lru lock here to block bdflush. */ - atomic_inc(&bh->b_count); - lock_buffer(bh); + if (bh->b_dev == B_FREE) + BUG(); + next = bh->b_this_page; - clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Mapped, &bh->b_state); - clear_bit(BH_Req, &bh->b_state); - clear_bit(BH_New, &bh->b_state); + + unmap_buffer(bh); spin_lock(&lru_list_lock); write_lock(&hash_table_lock); - spin_lock(&free_list[index].lock); spin_lock(&unused_list_lock); - if (!atomic_dec_and_test(&bh->b_count)) + if (atomic_read(&bh->b_count)) BUG(); __hash_unlink(bh); - /* The bunffer can be either on the regular - * queues or on the free list.. - */ - if (bh->b_dev != B_FREE) - __remove_from_queues(bh); - else - __remove_from_free_list(bh, index); - __put_unused_buffer_head(bh); - spin_unlock(&unused_list_lock); write_unlock(&hash_table_lock); - spin_unlock(&free_list[index].lock); + + __remove_from_queues(bh); spin_unlock(&lru_list_lock); - /* We can unlock the buffer, we have just returned it. - * Ditto for the counter - */ + + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); + return next; } @@ -1400,6 +1421,7 @@ /* And free the page */ page->buffers = NULL; page_cache_release(page); + lru_cache_unbuf(page); } static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize) @@ -1421,6 +1443,7 @@ tail->b_this_page = head; page->buffers = head; page_cache_get(page); + lru_cache_buf(page, LRU_NORMAL_CACHE); } static void unmap_underlying_metadata(struct buffer_head * bh) @@ -1868,6 +1891,7 @@ } spin_unlock(&unused_list_lock); + wake_up(&buffer_wait); return iosize; } @@ -2004,6 +2028,8 @@ __put_unused_buffer_head(bh[bhind]); } spin_unlock(&unused_list_lock); + wake_up(&buffer_wait); + goto finished; } @@ -2170,7 +2196,8 @@ spin_unlock(&free_list[isize].lock); page->buffers = bh; - lru_cache_add(page); + page->flags &= ~(1 << PG_referenced); + lru_cache_add(page, LRU_NORMAL_CACHE); atomic_inc(&buffermem_pages); return 1; @@ -2181,35 +2208,29 @@ } /* - * Sync all the buffers on one page.. - * - * If we have old buffers that are locked, we'll - * wait on them, but we won't wait on the new ones - * we're writing out now. - * - * This all is required so that we can free up memory - * later. + * Can the buffer be thrown out? 
*/ -static void sync_page_buffers(struct buffer_head *bh, int wait) +#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) +#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) + +static int sync_page_buffers(struct buffer_head * bh) { struct buffer_head * tmp = bh; do { - struct buffer_head *p = tmp; + if (buffer_dirty(tmp) && !buffer_locked(tmp)) + ll_rw_block(WRITE, 1, &tmp); tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (wait) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) - ll_rw_block(WRITE, 1, &p); } while (tmp != bh); -} -/* - * Can the buffer be thrown out? - */ -#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected)) -#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) + do { + if (buffer_busy(tmp)) + return 1; + tmp = tmp->b_this_page; + } while (tmp != bh); + + return 0; +} /* * try_to_free_buffers() checks if all the buffers on this particular page @@ -2222,21 +2243,20 @@ * obtain a reference to a buffer head within a page. So we must * lock out all of these paths to cleanly toss the page. */ -int try_to_free_buffers(struct page * page, int wait) +int try_to_free_buffers(struct page * page) { struct buffer_head * tmp, * bh = page->buffers; int index = BUFSIZE_INDEX(bh->b_size); + again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); spin_lock(&free_list[index].lock); tmp = bh; do { - struct buffer_head *p = tmp; - - tmp = tmp->b_this_page; - if (buffer_busy(p)) + if (buffer_busy(tmp)) goto busy_buffer_page; + tmp = tmp->b_this_page; } while (tmp != bh); spin_lock(&unused_list_lock); @@ -2272,7 +2292,8 @@ spin_unlock(&free_list[index].lock); write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); - sync_page_buffers(bh, wait); + if (!sync_page_buffers(bh)) + goto again; return 0; } diff -urN 2.4.0-test1-ac7/fs/dcache.c 2.4.0-test1-ac7-VM-31/fs/dcache.c --- 2.4.0-test1-ac7/fs/dcache.c Fri May 26 22:47:00 2000 +++ 2.4.0-test1-ac7-VM-31/fs/dcache.c Sat Jun 3 14:53:14 2000 @@ -512,21 +512,22 @@ * ... * 6 - base-level: try to shrink a bit. */ -int shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone) { - int count = 0; - lock_kernel(); - if (priority) - count = dentry_stat.nr_unused / priority; - prune_dcache(count); - unlock_kernel(); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. */ - kmem_cache_shrink(dentry_cache); - + if (gfp_mask & __GFP_IO) { + int count = 0; + lock_kernel(); + if (priority) + count = dentry_stat.nr_unused / priority; + prune_dcache(count); + unlock_kernel(); + /* FIXME: kmem_cache_shrink here should tell us + the number of pages freed, and it should + work in a __GFP_DMA/__GFP_HIGHMEM behaviour + to free only the interesting pages in + function of the needs of the current allocation. */ + kmem_cache_shrink(dentry_cache); + } return 0; } diff -urN 2.4.0-test1-ac7/fs/exec.c 2.4.0-test1-ac7-VM-31/fs/exec.c --- 2.4.0-test1-ac7/fs/exec.c Sat Jun 3 14:52:34 2000 +++ 2.4.0-test1-ac7-VM-31/fs/exec.c Sat Jun 3 14:53:14 2000 @@ -266,6 +266,7 @@ return; } flush_page_to_ram(page); + page_anon_init_map_wmb(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY)))); /* no need for flush_tlb */ } @@ -308,6 +309,7 @@ if (bprm->page[i]) { current->mm->rss++; put_dirty_page(current,bprm->page[i],stack_base); + bprm->page[i] = NULL; } stack_base += PAGE_SIZE; } @@ -860,9 +862,11 @@ /* Assumes that free_page() can take a NULL argument. 
*/ /* I hope this is ok for all architectures */ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) - if (bprm.page[i]) - __free_page(bprm.page[i]); + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { + struct page * page = bprm.page[i]; + if (page) + __free_page(page); + } return retval; } diff -urN 2.4.0-test1-ac7/fs/inode.c 2.4.0-test1-ac7-VM-31/fs/inode.c --- 2.4.0-test1-ac7/fs/inode.c Sat Jun 3 14:52:34 2000 +++ 2.4.0-test1-ac7-VM-31/fs/inode.c Sat Jun 3 14:53:14 2000 @@ -450,20 +450,21 @@ dispose_list(freeable); } -int shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone) { - int count = 0; + if (gfp_mask & __GFP_IO) { + int count = 0; - if (priority) - count = inodes_stat.nr_unused / priority; - prune_icache(count); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. */ - kmem_cache_shrink(inode_cachep); - + if (priority) + count = inodes_stat.nr_unused / priority; + prune_icache(count); + /* FIXME: kmem_cache_shrink here should tell us + the number of pages freed, and it should + work in a __GFP_DMA/__GFP_HIGHMEM behaviour + to free only the interesting pages in + function of the needs of the current allocation. */ + kmem_cache_shrink(inode_cachep); + } return 0; } diff -urN 2.4.0-test1-ac7/include/linux/cache.h 2.4.0-test1-ac7-VM-31/include/linux/cache.h --- 2.4.0-test1-ac7/include/linux/cache.h Sun May 28 20:52:34 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/cache.h Sat Jun 3 15:45:16 2000 @@ -1,6 +1,7 @@ #ifndef __LINUX_CACHE_H #define __LINUX_CACHE_H +#include <linux/config.h> #include <asm/cache.h> #ifndef L1_CACHE_ALIGN @@ -13,6 +14,14 @@ #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#ifndef ____cacheline_aligned_in_smp +#ifdef CONFIG_SMP +#define ____cacheline_aligned_in_smp ____cacheline_aligned +#else +#define ____cacheline_aligned_in_smp +#endif /* CONFIG_SMP */ #endif #ifndef __cacheline_aligned diff -urN 2.4.0-test1-ac7/include/linux/dcache.h 2.4.0-test1-ac7-VM-31/include/linux/dcache.h --- 2.4.0-test1-ac7/include/linux/dcache.h Sun May 28 20:52:34 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/dcache.h Sat Jun 3 15:45:15 2000 @@ -150,11 +150,11 @@ #define shrink_dcache() prune_dcache(0) struct zone_struct; /* dcache memory management */ -extern int shrink_dcache_memory(int, unsigned int); +extern int shrink_dcache_memory(int, unsigned int, struct zone_struct *); extern void prune_dcache(int); /* icache memory management (defined in linux/fs/inode.c) */ -extern int shrink_icache_memory(int, int); +extern int shrink_icache_memory(int, int, struct zone_struct *); extern void prune_icache(int); /* only used at mount-time */ diff -urN 2.4.0-test1-ac7/include/linux/fs.h 2.4.0-test1-ac7-VM-31/include/linux/fs.h --- 2.4.0-test1-ac7/include/linux/fs.h Sat Jun 3 14:52:34 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/fs.h Sat Jun 3 15:45:16 2000 @@ -910,7 +910,7 @@ extern int fs_may_remount_ro(struct super_block *); -extern int try_to_free_buffers(struct page *, int); +extern int try_to_free_buffers(struct page *); extern void refile_buffer(struct buffer_head * buf); #define BUF_CLEAN 0 diff -urN 2.4.0-test1-ac7/include/linux/highmem.h 2.4.0-test1-ac7-VM-31/include/linux/highmem.h --- 2.4.0-test1-ac7/include/linux/highmem.h Sun May 28 20:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/highmem.h Sat Jun 3 15:45:16 2000 @@ 
-12,7 +12,7 @@ /* declarations for linux/mm/highmem.c */ extern unsigned long highmem_mapnr; -FASTCALL(unsigned int nr_free_highpages(void)); +extern unsigned long nr_free_highpages(void); extern struct page * prepare_highmem_swapout(struct page *); extern struct page * replace_with_highmem(struct page *); @@ -20,7 +20,7 @@ #else /* CONFIG_HIGHMEM */ -extern inline unsigned int nr_free_highpages(void) { return 0; } +#define nr_free_highpages() 0UL #define prepare_highmem_swapout(page) page #define replace_with_highmem(page) page #define kmap(page) page_address(page) diff -urN 2.4.0-test1-ac7/include/linux/mm.h 2.4.0-test1-ac7-VM-31/include/linux/mm.h --- 2.4.0-test1-ac7/include/linux/mm.h Sat Jun 3 14:52:34 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/mm.h Sat Jun 3 15:52:27 2000 @@ -15,7 +15,6 @@ extern unsigned long num_physpages; extern void * high_memory; extern int page_cluster; -extern struct list_head lru_cache; #include #include @@ -146,6 +145,7 @@ unsigned long index; struct page *next_hash; atomic_t count; + int map_count; unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct list_head lru; wait_queue_head_t wait; @@ -153,11 +153,11 @@ struct buffer_head * buffers; unsigned long virtual; /* nonzero if kmapped */ struct zone_struct *zone; - unsigned int age; } mem_map_t; #define get_page(p) atomic_inc(&(p)->count) #define put_page(p) __free_page(p) +#define put_page_raw(p) atomic_dec(&(p)->count) #define put_page_testzero(p) atomic_dec_and_test(&(p)->count) #define page_count(p) atomic_read(&(p)->count) #define set_page_count(p,v) atomic_set(&(p)->count, v) @@ -169,8 +169,8 @@ #define PG_uptodate 3 #define PG_dirty 4 #define PG_decr_after 5 -#define PG_unused_01 6 -#define PG_active 7 +#define PG_out_lru 6 +#define PG__unused_02 7 #define PG_slab 8 #define PG_swap_cache 9 #define PG_skip 10 @@ -194,9 +194,6 @@ clear_bit(PG_locked, &(page)->flags); \ wake_up(&page->wait); \ } while (0) -#define PageActive(page) test_bit(PG_active, &(page)->flags) -#define SetPageActive(page) set_bit(PG_active, &(page)->flags) -#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) #define ClearPageError(page) clear_bit(PG_error, &(page)->flags) @@ -229,6 +226,12 @@ #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) +#define PageSetOutLru(page) set_bit(PG_out_lru, &(page)->flags) +#define PageClearOutLru(page) clear_bit(PG_out_lru, &(page)->flags) +#define PageTestandSetOutLru(page) test_and_set_bit(PG_out_lru, &(page)->flags) +#define PageTestandClearOutLru(page) test_and_clear_bit(PG_out_lru, &(page)->flags) +#define PageOutLru(page) test_bit(PG_out_lru, &(page)->flags) + /* * Error return values for the *_nopage functions */ @@ -314,21 +317,21 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ -extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order)); +extern struct page * FASTCALL(__alloc_pages(gfpmask_zone_t *, unsigned long order)); extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order); #ifndef CONFIG_DISCONTIGMEM -static inline struct page * alloc_pages(int gfp_mask, unsigned long order) +extern inline struct page * alloc_pages(int gfp_mask, unsigned long order) { /* temporary check. 
*/ - if (contig_page_data.node_zonelists[gfp_mask].gfp_mask != (gfp_mask)) + if (contig_page_data.node_gfpmask_zone[gfp_mask].gfp_mask != (gfp_mask)) BUG(); /* * Gets optimized away by the compiler. */ if (order >= MAX_ORDER) return NULL; - return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order); + return __alloc_pages(contig_page_data.node_gfpmask_zone+gfp_mask, order); } #else /* !CONFIG_DISCONTIGMEM */ extern struct page * alloc_pages(int gfp_mask, unsigned long order); @@ -460,9 +463,9 @@ /* filemap.c */ extern void remove_inode_page(struct page *); extern unsigned long page_unuse(struct page *); -extern int shrink_mmap(int, int); +extern int shrink_mmap(int, zone_t *); extern void truncate_inode_pages(struct address_space *, loff_t); -extern void truncate_all_inode_pages(struct address_space *); +#define truncate_all_inode_pages(x) truncate_inode_pages(x, 0) /* generic vm_area_ops exported for stackable file systems */ extern int filemap_swapout(struct page * page, struct file *file); @@ -491,7 +494,7 @@ #define GFP_ATOMIC (__GFP_HIGH) #define GFP_USER (__GFP_WAIT | __GFP_IO) #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) -#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO) +#define GFP_KERNEL (__GFP_WAIT | __GFP_IO) #define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO) #define GFP_KSWAPD (__GFP_IO) @@ -542,15 +545,183 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); -#define buffer_under_min() (atomic_read(&buffermem_pages) * 100 < \ - buffer_mem.min_percent * num_physpages) -#define pgcache_under_min() (atomic_read(&page_cache_size) * 100 < \ - page_cache.min_percent * num_physpages) +#define lru_cache_under_min(lru_pages) ((lru_pages) * 100 < \ + lru_cache_mem.min_percent * num_physpages) #define vmlist_access_lock(mm) spin_lock(&mm->page_table_lock) #define vmlist_access_unlock(mm) spin_unlock(&mm->page_table_lock) #define vmlist_modify_lock(mm) vmlist_access_lock(mm) #define vmlist_modify_unlock(mm) vmlist_access_unlock(mm) + +/* + * Helper macros for lru_cache handling. + */ + +/* + * lru_cache_add can be run on a page that is just mapped + * (this precisely will always happen when adding an anonymous page + * shared by lots of read-only users after a COW to the swap cache). + * In the current VM design it never happens that buffer cache page + * is mapped or that we insert in the lru a page that has still I/O + * buffers on it. + */ +#define lru_cache_add(page, lru_type) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count < 0 || \ + (!(page)->mapping && !(page)->buffers) || \ + ((page)->map_count && (page)->buffers) || \ + PageOutLru(page)) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if (!(page)->map_count) { \ + list_add(&(page)->lru, &this_lru->heads[lru_type]); \ + this_lru->nr_cache_pages++; \ + } else \ + this_lru->nr_map_pages++; \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* + * Only the pages without overlapped buffer headers can be removed from the + * lru or we won't be able to reclaim the buffer header memory anymore and + * we would make them unfreeable. + * + * Locking: if the page isn't under shrink mmap processing + * then page->buffers can't change from under us. If the the page is under + * shrink_mmap processing, then we can simply update the map count and + * shrink_mmap will finish the work later. With this locking protocol we + * avoid getting the lock while mapping pages. 
+ */ +#define lru_cache_map(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count < 0 || !(page)->mapping) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if (!(page)->map_count++ && !(page)->buffers && \ + !PageOutLru(page)) { \ + list_del(&(page)->lru); \ + this_lru->nr_cache_pages--; \ + this_lru->nr_map_pages++; \ + } \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* no need of any lock, at this time the page is still local to us */ +#define page_anon_init_map_wmb(page) \ +do { \ + if ((page)->map_count) \ + BUG(); \ + set_wmb((page)->map_count, 1); \ +} while (0) + +/* dup the mapping for a page known to be just referenced as mapped */ +#define page_map(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count <= 0) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + (page)->map_count++; \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* drop a map reference from an _anonymoys_ page mapped in our current MM */ +#define page_unmap(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->mapping || (page)->map_count <= 0) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + (page)->map_count--; \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* + * Don't discriminate between lru and put all pages that were mapped + * in the normal lru. + * + * This function can be called also on all kind of pages provided they + * were accounted in the mapping logic. + * + * Locking: if the page isn't under shrink mmap processing + * then page->buffers can't change from under us. It may be possible + * we did both the map and the unmap while the page was out of the lru, + * shrink_mmap will get this case right too later. + */ +#define lru_cache_unmap(page, lru_type) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if ((page)->map_count <= 0) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if (!--(page)->map_count && (page)->mapping && \ + !(page)->buffers && !PageOutLru(page)) { \ + list_add(&(page)->lru, &this_lru->heads[lru_type]); \ + this_lru->nr_cache_pages++; \ + this_lru->nr_map_pages--; \ + } \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* + * This puts the page in the lru in case it was out of the lru since + * we overlapped some buffer head on the page. + * We hold the lock the per-page lock here. + */ +#define lru_cache_buf(page, lru_type) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if (!(page)->mapping || !(page)->buffers || PageOutLru(page)) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if ((page)->map_count) { \ + list_add(&(page)->lru, &this_lru->heads[lru_type]); \ + this_lru->nr_cache_pages++; \ + this_lru->nr_map_pages--; \ + } \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* + * This is called when we drop the buffer headers from the page. We must + * remove the page from the lru if it wasn't mapped. We hold the per-page + * lock here. + */ +#define lru_cache_unbuf(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if (!(page)->mapping || (page)->buffers || PageOutLru(page)) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if ((page)->map_count) { \ + list_del(&(page)->lru); \ + this_lru->nr_cache_pages--; \ + this_lru->nr_map_pages++; \ + } \ + spin_unlock(&this_lru->lock); \ +} while (0) + +/* + * Needs the lock on the page to be sure the page is in the lru list. 
+ * swapoff is the only caller that can drop a mapped cache from the lru + * in order to do the swap-cache-page to anonymous-page conversion. + */ +#define lru_cache_del(page) \ +do { \ + lru_cache_t * this_lru = &(page)->zone->zone_pgdat->lru_cache; \ + if (!PageLocked(page) || (page)->map_count < 0 || \ + !(page)->mapping || (page)->buffers || PageOutLru(page)) \ + BUG(); \ + spin_lock(&this_lru->lock); \ + if (!(page)->map_count) { \ + list_del(&(page)->lru); \ + this_lru->nr_cache_pages--; \ + } else \ + this_lru->nr_map_pages--; \ + spin_unlock(&this_lru->lock); \ +} while (0) #endif /* __KERNEL__ */ diff -urN 2.4.0-test1-ac7/include/linux/mmzone.h 2.4.0-test1-ac7-VM-31/include/linux/mmzone.h --- 2.4.0-test1-ac7/include/linux/mmzone.h Sun May 28 20:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/mmzone.h Sat Jun 3 15:45:16 2000 @@ -21,16 +21,26 @@ struct pglist_data; +/* + * Memory balancing internally to the node can work correctly only on + * classzone basis while handling overlapped classzones. + */ typedef struct zone_struct { /* * Commonly accessed fields: */ - spinlock_t lock; unsigned long offset; unsigned long free_pages; - char low_on_memory; - char zone_wake_kswapd; + + /* + * Memory balancing is all classzone based, all the below + * fields refer to the classzone. The classzone includes + * the current zone plus all the lower zones in the MM. + */ + unsigned long classzone_free_pages; unsigned long pages_min, pages_low, pages_high; + int nr_zone; + char zone_wake_kswapd; /* * free areas of different sizes @@ -57,27 +67,34 @@ #define MAX_NR_ZONES 3 /* - * One allocation request operates on a zonelist. A zonelist - * is a list of zones, the first one is the 'goal' of the - * allocation, the other zones are fallback zones, in decreasing - * priority. - * - * Right now a zonelist takes up less than a cacheline. We never - * modify it apart from boot-up, and only a few indices are used, - * so despite the zonelist table being relatively big, the cache - * footprint of this construct is very small. + * The pgdat->node_gfpmask_zone[] array tell us which classzone + * we should allocate from given a certain gfpmask. It translates + * the gfpmask to a classzone. 
*/ -typedef struct zonelist_struct { - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited +typedef struct gfpmask_zone_s { + zone_t * classzone; int gfp_mask; -} zonelist_t; +} gfpmask_zone_t; #define NR_GFPINDEX 0x100 +#define LRU_SWAP_CACHE 0 +#define LRU_NORMAL_CACHE 1 +#define NR_LRU_CACHE 2 +typedef struct lru_cache_s { + struct list_head heads[NR_LRU_CACHE]; + unsigned long nr_cache_pages; /* pages in the lrus */ + unsigned long nr_map_pages; /* pages temporarly out of the lru */ + /* keep lock in a separate cacheline to avoid ping pong in SMP */ + spinlock_t lock ____cacheline_aligned_in_smp; +} lru_cache_t; + struct bootmem_data; typedef struct pglist_data { + int nr_zones; zone_t node_zones[MAX_NR_ZONES]; - zonelist_t node_zonelists[NR_GFPINDEX]; + gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX]; + lru_cache_t lru_cache; struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; @@ -86,14 +103,14 @@ unsigned long node_size; int node_id; struct pglist_data *node_next; + spinlock_t freelist_lock ____cacheline_aligned_in_smp; } pg_data_t; extern int numnodes; extern pg_data_t *pgdat_list; #define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \ - && (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \ - ((tzone) - (pgzone)->zone_pgdat->node_zones))) + && ((pgzone) <= (tzone))) /* * The following two are not meant for general usage. They are here as diff -urN 2.4.0-test1-ac7/include/linux/pagemap.h 2.4.0-test1-ac7-VM-31/include/linux/pagemap.h --- 2.4.0-test1-ac7/include/linux/pagemap.h Sun May 28 20:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/pagemap.h Sat Jun 3 15:52:31 2000 @@ -80,8 +80,7 @@ extern void __add_page_to_hash_queue(struct page * page, struct page **p); -extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long index); -extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); +extern void add_to_swap_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); extern inline void add_page_to_hash_queue(struct page * page, struct inode * inode, unsigned long index) { diff -urN 2.4.0-test1-ac7/include/linux/sched.h 2.4.0-test1-ac7-VM-31/include/linux/sched.h --- 2.4.0-test1-ac7/include/linux/sched.h Sun May 28 20:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/sched.h Sat Jun 3 15:45:16 2000 @@ -309,6 +309,7 @@ long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + int low_on_memory:1; int swappable:1; /* process credentials */ uid_t uid,euid,suid,fsuid; diff -urN 2.4.0-test1-ac7/include/linux/swap.h 2.4.0-test1-ac7-VM-31/include/linux/swap.h --- 2.4.0-test1-ac7/include/linux/swap.h Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/swap.h Sat Jun 3 16:23:22 2000 @@ -64,10 +64,8 @@ }; extern int nr_swap_pages; -FASTCALL(unsigned int nr_free_pages(void)); -FASTCALL(unsigned int nr_free_buffer_pages(void)); -FASTCALL(unsigned int nr_free_highpages(void)); -extern int nr_lru_pages; +extern unsigned long nr_free_pages(void); +extern unsigned long nr_free_buffer_pages(void); extern atomic_t nr_async_pages; extern struct address_space swapper_space; extern atomic_t page_cache_size; @@ -80,13 +78,13 @@ struct zone_t; /* linux/ipc/shm.c */ -extern int shm_swap(int, int); +extern int shm_swap(int, int, zone_t *); /* linux/mm/swap.c */ 
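The mmzone.h hunk above drops the per-gfpmask zonelist fallback array in favour of a single classzone pointer per gfpmask, and memclass() collapses to a plain comparison because a node's zones sit in one ascending array (DMA, Normal, HighMem). The stand-alone sketch below models that lookup in user space; gfpmask_zone_t, pg_data_t, node_gfpmask_zone[], NR_GFPINDEX and the memclass() comparison come from the patch, while the stripped-down zone_t, the example table and main() are invented here purely for illustration.

/*
 * Illustration only: stripped-down stand-ins for the structures this
 * patch introduces.  The real kernel types carry many more fields.
 */
#include <stdio.h>

#define MAX_NR_ZONES	3
#define NR_GFPINDEX	0x100

typedef struct zone_struct {
	const char *name;
} zone_t;

typedef struct gfpmask_zone_s {
	zone_t *classzone;	/* highest zone this gfpmask may use */
	int gfp_mask;
} gfpmask_zone_t;

typedef struct pglist_data {
	zone_t node_zones[MAX_NR_ZONES];	/* ascending: DMA, Normal, HighMem */
	gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX];
} pg_data_t;

/* zones of a node live in one ascending array, so the classzone test
 * reduces to a comparison, as in the patched memclass() */
#define memclass(pgzone, tzone)	((pgzone) <= (tzone))

int main(void)
{
	static pg_data_t node;
	zone_t *classzone;
	int i;

	node.node_zones[0].name = "DMA";
	node.node_zones[1].name = "Normal";
	node.node_zones[2].name = "HighMem";

	/* pretend every gfpmask is allowed up to the Normal classzone */
	for (i = 0; i < NR_GFPINDEX; i++) {
		node.node_gfpmask_zone[i].classzone = &node.node_zones[1];
		node.node_gfpmask_zone[i].gfp_mask = i;
	}

	classzone = node.node_gfpmask_zone[0].classzone;
	printf("classzone for gfpmask 0: %s\n", classzone->name);
	/* a DMA page belongs to the Normal classzone, a HighMem page does not */
	printf("DMA page in classzone:     %d\n",
	       memclass(&node.node_zones[0], classzone));
	printf("HighMem page in classzone: %d\n",
	       memclass(&node.node_zones[2], classzone));
	return 0;
}

The same memclass() test is what restricts page stealing in the patched shm_swap_core() and shrink_mmap() below to pages that can actually relieve pressure on the zone being allocated from.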
extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(unsigned int gfp_mask); +extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *, int); @@ -147,57 +145,15 @@ * swap IO on this page. (The page cache _does_ count as another valid * reference to the page, however.) */ -static inline int is_page_shared(struct page *page) +static inline int is_swap_cache_shared(struct page *page) { unsigned int count; - if (PageReserved(page)) - return 1; + if (PageReserved(page) || !PageSwapCache(page) || !PageLocked(page)) + BUG(); count = page_count(page); - if (PageSwapCache(page)) - count += swap_count(page) - 2 - !!page->buffers; + count += swap_count(page) - 2 - !!page->buffers; return count > 1; } - -extern spinlock_t pagemap_lru_lock; - -/* - * Magic constants for page aging. If the system is programmed - * right, tweaking these should have almost no effect... - * The 2.4 code, however, is mostly simple and stable ;) - */ -#define PG_AGE_MAX 64 -#define PG_AGE_START 5 -#define PG_AGE_ADV 3 -#define PG_AGE_DECL 1 - -/* - * Helper macros for lru_pages handling. - */ -#define lru_cache_add(page) \ -do { \ - spin_lock(&pagemap_lru_lock); \ - list_add(&(page)->lru, &lru_cache); \ - nr_lru_pages++; \ - page->age = PG_AGE_START; \ - SetPageActive(page); \ - spin_unlock(&pagemap_lru_lock); \ -} while (0) - -#define __lru_cache_del(page) \ -do { \ - list_del(&(page)->lru); \ - ClearPageActive(page); \ - nr_lru_pages--; \ -} while (0) - -#define lru_cache_del(page) \ -do { \ - if (!PageLocked(page)) \ - BUG(); \ - spin_lock(&pagemap_lru_lock); \ - __lru_cache_del(page); \ - spin_unlock(&pagemap_lru_lock); \ -} while (0) extern spinlock_t swaplock; diff -urN 2.4.0-test1-ac7/include/linux/swapctl.h 2.4.0-test1-ac7-VM-31/include/linux/swapctl.h --- 2.4.0-test1-ac7/include/linux/swapctl.h Sun May 28 20:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/swapctl.h Sat Jun 3 15:45:16 2000 @@ -11,8 +11,7 @@ unsigned int max_percent; } buffer_mem_v1; typedef buffer_mem_v1 buffer_mem_t; -extern buffer_mem_t buffer_mem; -extern buffer_mem_t page_cache; +extern buffer_mem_t lru_cache_mem; typedef struct freepages_v1 { diff -urN 2.4.0-test1-ac7/include/linux/sysctl.h 2.4.0-test1-ac7-VM-31/include/linux/sysctl.h --- 2.4.0-test1-ac7/include/linux/sysctl.h Sat May 13 10:15:20 2000 +++ 2.4.0-test1-ac7-VM-31/include/linux/sysctl.h Sat Jun 3 14:53:15 2000 @@ -119,15 +119,18 @@ enum { VM_SWAPCTL=1, /* struct: Set vm swapping control */ - VM_SWAPOUT=2, /* int: Linear or sqrt() swapout for hogs */ + VM_SWAPOUT=2, /* int: Background pageout interval */ VM_FREEPG=3, /* struct: Set free page thresholds */ VM_BDFLUSH=4, /* struct: Control buffer cache flushing */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ +#if 0 /* obsolete but don't reuse */ VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ +#endif VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ - VM_PAGE_CLUSTER=10 /* int: set number of pages to swap together */ + VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_LRU_CACHE=11, /* struct: Set lru cache memory thresholds */ }; diff -urN 2.4.0-test1-ac7/ipc/shm.c 2.4.0-test1-ac7-VM-31/ipc/shm.c --- 2.4.0-test1-ac7/ipc/shm.c Fri May 26 22:47:09 2000 +++ 2.4.0-test1-ac7-VM-31/ipc/shm.c Sat Jun 3 14:53:15 2000 @@ -132,7 +132,7 @@ static int 
sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data); #endif -static void zshm_swap (int prio, int gfp_mask); +static void zshm_swap (int prio, zone_t *zone); static void zmap_unuse(swp_entry_t entry, struct page *page); static void shmzero_open(struct vm_area_struct *shmd); static void shmzero_close(struct vm_area_struct *shmd); @@ -1411,7 +1411,7 @@ #define RETRY 1 #define FAILED 2 -static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage) +static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage) { pte_t page; struct page *page_map; @@ -1420,7 +1420,7 @@ if (!pte_present(page)) return RETRY; page_map = pte_page(page); - if (page_map->zone->free_pages > page_map->zone->pages_high) + if (!memclass(page_map->zone, zone)) return RETRY; if (shp->id != zero_id) swap_attempts++; @@ -1473,22 +1473,26 @@ static unsigned long swap_id; /* currently being swapped */ static unsigned long swap_idx; /* next to swap */ -int shm_swap (int prio, int gfp_mask) +int shm_swap (int prio, int gfp_mask, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; unsigned long id, idx; - int loop = 0; + int loop; int counter; struct page * page_map; - zshm_swap(prio, gfp_mask); + if (!(gfp_mask & __GFP_IO)) + return 0; + + zshm_swap(prio, zone); counter = shm_rss / (prio + 1); if (!counter) return 0; if (shm_swap_preop(&swap_entry)) return 0; + loop = 0; shm_lockall(); check_id: shp = shm_get(swap_id); @@ -1514,7 +1518,7 @@ if (idx >= shp->shm_npages) goto next_id; - switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) { + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { case RETRY: goto check_table; case FAILED: goto failed; } @@ -1800,7 +1804,7 @@ spin_unlock(&zmap_list_lock); } -static void zshm_swap (int prio, int gfp_mask) +static void zshm_swap (int prio, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; @@ -1845,7 +1849,7 @@ goto next_id; } - switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) { + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { case RETRY: goto check_table; case FAILED: goto failed; } diff -urN 2.4.0-test1-ac7/ipc/util.c 2.4.0-test1-ac7-VM-31/ipc/util.c --- 2.4.0-test1-ac7/ipc/util.c Fri May 26 22:47:09 2000 +++ 2.4.0-test1-ac7-VM-31/ipc/util.c Sat Jun 3 14:53:15 2000 @@ -243,7 +243,7 @@ return; } -int shm_swap (int prio, int gfp_mask) +int shm_swap (int prio, int gfp_mask, zone_t *zone) { return 0; } diff -urN 2.4.0-test1-ac7/kernel/sysctl.c 2.4.0-test1-ac7-VM-31/kernel/sysctl.c --- 2.4.0-test1-ac7/kernel/sysctl.c Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/kernel/sysctl.c Sat Jun 3 14:53:15 2000 @@ -233,16 +233,14 @@ &bdflush_min, &bdflush_max}, {VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory, sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec}, - {VM_BUFFERMEM, "buffermem", - &buffer_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, - {VM_PAGECACHE, "pagecache", - &page_cache, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, {VM_PAGERDAEMON, "kswapd", &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PGT_CACHE, "pagetable_cache", &pgt_cache_water, 2*sizeof(int), 0644, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_LRU_CACHE, "lru_cache", + 
&lru_cache_mem, sizeof(buffer_mem_t), 0644, NULL, &proc_dointvec}, {0} }; diff -urN 2.4.0-test1-ac7/mm/filemap.c 2.4.0-test1-ac7-VM-31/mm/filemap.c --- 2.4.0-test1-ac7/mm/filemap.c Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/mm/filemap.c Sat Jun 3 15:11:13 2000 @@ -44,20 +44,12 @@ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; struct page **page_hash_table; -struct list_head lru_cache; static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; -/* - * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with - * the pagemap_lru_lock held. - */ -spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) -#define min(a,b) ((a < b) ? a : b) - void __add_page_to_hash_queue(struct page * page, struct page **p) { atomic_inc(&page_cache_size); @@ -69,7 +61,7 @@ PAGE_BUG(page); } -static inline void remove_page_from_hash_queue(struct page * page) +static void remove_page_from_hash_queue(struct page * page) { if(page->pprev_hash) { if(page->next_hash) @@ -127,9 +119,7 @@ struct page * page; head = &inode->i_mapping->pages; -repeat: spin_lock(&pagecache_lock); - spin_lock(&pagemap_lru_lock); curr = head->next; while (curr != head) { @@ -139,53 +129,19 @@ /* We cannot invalidate a locked page */ if (TryLockPage(page)) continue; - if (page->buffers) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); - block_destroy_buffers(page); - remove_inode_page(page); - lru_cache_del(page); - page_cache_release(page); - UnlockPage(page); - page_cache_release(page); - goto repeat; - } - __remove_inode_page(page); - __lru_cache_del(page); + + lru_cache_del(page); + + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); + page->mapping = NULL; + UnlockPage(page); page_cache_release(page); } - spin_unlock(&pagemap_lru_lock); spin_unlock(&pagecache_lock); } -static inline void truncate_partial_page(struct page *page, unsigned partial) -{ - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - - if (page->buffers) - block_flushpage(page, partial); - -} - -static inline void truncate_complete_page(struct page *page) -{ - if (page->buffers) - block_destroy_buffers(page); - lru_cache_del(page); - - /* - * We remove the page from the page cache _after_ we have - * destroyed all buffer-cache references to it. Otherwise some - * other process might think this inode page is not in the - * page cache and creates a buffer-cache alias to it causing - * all sorts of fun problems ... - */ - remove_inode_page(page); - page_cache_release(page); -} - /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate @@ -201,39 +157,47 @@ struct page * page; unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); unsigned long start; + /* + * Only one truncate can run at once so we can hide + * processed pages into our local dispose list to decrease + * the complexity of the `repeat` path. + */ + LIST_HEAD(dispose); start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -repeat: head = &mapping->pages; +repeat: spin_lock(&pagecache_lock); - curr = head->next; - while (curr != head) { + while ((curr = head->next) != head) { unsigned long offset; page = list_entry(curr, struct page, list); - curr = curr->next; offset = page->index; - /* Is one of the pages to truncate? 
*/ - if ((offset >= start) || (partial && (offset + 1) == start)) { - if (TryLockPage(page)) { - page_cache_get(page); - spin_unlock(&pagecache_lock); - wait_on_page(page); - page_cache_release(page); - goto repeat; - } - page_cache_get(page); + /* page wholly truncated - free it */ + if (offset >= start) { + get_page(page); + if (TryLockPage(page)) + goto wait_unlock; spin_unlock(&pagecache_lock); - if (partial && (offset + 1) == start) { - truncate_partial_page(page, partial); - partial = 0; - } else - truncate_complete_page(page); + if (page->buffers) + block_destroy_buffers(page); + lru_cache_del(page); + + /* + * We remove the page from the page cache + * _after_ we have destroyed all buffer-cache + * references to it. Otherwise some other process + * might think this inode page is not in the + * page cache and creates a buffer-cache alias + * to it causing all sorts of fun problems ... + */ + remove_inode_page(page); UnlockPage(page); + put_page_raw(page); page_cache_release(page); /* @@ -245,128 +209,134 @@ */ goto repeat; } - } - spin_unlock(&pagecache_lock); -} -/** - * truncate_all_inode_pages - truncate *all* the pages - * @mapping: mapping to truncate - * - * Truncate all the inode pages. If any page is locked we wait for it - * to become unlocked. This function can block. - */ -void truncate_all_inode_pages(struct address_space * mapping) -{ - struct list_head *head, *curr; - struct page * page; + /* + * there is only one partial page possible and it's the + * one preceeding the first wholly truncated page. + */ + if (!partial || (offset + 1) != start) { + list_del(curr); list_add(curr, &dispose); + continue; + } - head = &mapping->pages; -repeat: - spin_lock(&pagecache_lock); - spin_lock(&pagemap_lru_lock); - curr = head->next; + /* partial truncate, clear end of page */ + get_page(page); + if (TryLockPage(page)) + goto wait_unlock; + list_del(curr); /* page cache can grow under truncate */ + spin_unlock(&pagecache_lock); - while (curr != head) { - page = list_entry(curr, struct page, list); - curr = curr->next; + /* + * Nobody can try to list_del() the page pointed by `curr' + * from under us (we hold a reference on the page) and + * so we don't need the lock held while adding the page + * to the local dispose list. We only need to insert curr + * into our internal dispose list before releasing our + * reference on the page. + */ +#if 1 + curr->next = curr->prev = NULL; /* + * Trigger an oops if somebody + * tries to unlink the page + * under processing from the + * cache. + */ +#endif - if (TryLockPage(page)) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); - wait_on_page(page); - page_cache_release(page); - goto repeat; - } - if (page->buffers) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - spin_unlock(&pagecache_lock); - block_destroy_buffers(page); - remove_inode_page(page); - lru_cache_del(page); - page_cache_release(page); - UnlockPage(page); - page_cache_release(page); - goto repeat; - } - __lru_cache_del(page); - __remove_inode_page(page); + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (page->buffers) + block_flushpage(page, partial); + + partial = 0; + + /* + * we have dropped the spinlock so we have to + * restart. + */ UnlockPage(page); + /* + * This is time to add `curr' to a valid list to allow + * somebody else to unlink the page later. 
+ */ + list_add(curr, &dispose); page_cache_release(page); - } + goto repeat; - spin_unlock(&pagemap_lru_lock); + wait_unlock: + spin_unlock(&pagecache_lock); + ___wait_on_page(page); + put_page(page); + goto repeat; + } + list_splice(&dispose, head); spin_unlock(&pagecache_lock); } -/* - * nr_dirty represents the number of dirty pages that we will write async - * before doing sync writes. We can only do sync writes if we can - * wait for IO (__GFP_IO set). - */ -int shrink_mmap(int priority, int gfp_mask) -{ - int ret = 0, count, nr_dirty; - struct list_head * page_lru; - struct page * page = NULL; - - count = nr_lru_pages / (priority + 1); - nr_dirty = priority; +static int FASTCALL(__shrink_mmap(int priority, zone_t *zone, + unsigned long * __count, + lru_cache_t * this_lru, + int lru_type)); +static int __shrink_mmap(int priority, zone_t *zone, + unsigned long * __count, + lru_cache_t * this_lru, + int lru_type) +{ + int ret = 0; + unsigned long count = *__count; + LIST_HEAD(young); + LIST_HEAD(old); + LIST_HEAD(forget); + struct list_head * page_lru, * dispose; + struct page * page; + spinlock_t * lru_lock = &this_lru->lock; + struct list_head * lru_head = &this_lru->heads[lru_type]; - /* we need pagemap_lru_lock for list_del() ... subtle code below */ - spin_lock(&pagemap_lru_lock); - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + spin_lock(lru_lock); + + while (count > 0 && (page_lru = lru_head->prev) != lru_head) { page = list_entry(page_lru, struct page, lru); list_del(page_lru); - if (PageTestandClearReferenced(page)) { - page->age += PG_AGE_ADV; - if (page->age > PG_AGE_MAX) - page->age = PG_AGE_MAX; - goto dispose_continue; - } - page->age -= min(PG_AGE_DECL, page->age); - - if (page->age) + dispose = &old; + /* don't account passes over not DMA pages */ + if (!memclass(page->zone, zone)) goto dispose_continue; count--; - /* - * Page is from a zone we don't care about. - * Don't drop page cache entries in vain. - */ - if (page->zone->free_pages > page->zone->pages_high) + + dispose = lru_head; + if (PageTestandClearReferenced(page)) + /* Roll the page at the top of the lru list, + * we could also be more aggressive putting + * the page in the young-dispose-list, so + * avoiding to free young pages in each pass. + */ goto dispose_continue; - /* - * Avoid unscalable SMP locking for pages we can - * immediate tell are untouchable.. - */ + dispose = &young; + + /* avoid unscalable SMP locking */ if (!page->buffers && page_count(page) > 1) goto dispose_continue; if (TryLockPage(page)) goto dispose_continue; - /* Release the pagemap_lru lock even if the page is not yet - queued in any lru queue since we have just locked down - the page so nobody else may SMP race with us running - a lru_cache_del() (lru_cache_del() always run with the - page locked down ;). */ - spin_unlock(&pagemap_lru_lock); + if (PageTestandSetOutLru(page)) + BUG(); + /* + * We can release the lru_cache lock even if the page is not + * queued in any list because we have just locked down + * the page and marked the page as out of the lru list. + */ + spin_unlock(lru_lock); /* avoid freeing the page while it's locked */ - page_cache_get(page); + get_page(page); - /* - * Is it a buffer page? Try to clean it up regardless - * of zone - it's old. - */ + /* Is it a buffer page? 
*/ if (page->buffers) { - int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0)); - if (!try_to_free_buffers(page, wait)) + if (!try_to_free_buffers(page)) goto unlock_continue; /* page was locked, inode can't go away under us */ if (!page->mapping) { @@ -394,45 +364,45 @@ * were to be marked referenced.. */ if (PageSwapCache(page)) { - if (!PageDirty(page)) { - spin_unlock(&pagecache_lock); - __delete_from_swap_cache(page); - goto made_inode_progress; - } - /* PageDeferswap -> we swap out the page now. */ - if (gfp_mask & __GFP_IO) - goto async_swap_continue; - goto cache_unlock_continue; - } + spin_unlock(&pagecache_lock); + __delete_from_swap_cache(page); + goto made_inode_progress; + } /* is it a page-cache page? */ if (page->mapping) { - if (!PageDirty(page) && !pgcache_under_min()) { - __remove_inode_page(page); + if (!PageDirty(page)) { + remove_page_from_inode_queue(page); + remove_page_from_hash_queue(page); + page->mapping = NULL; spin_unlock(&pagecache_lock); goto made_inode_progress; } goto cache_unlock_continue; } + dispose = &forget; printk(KERN_ERR "shrink_mmap: unknown LRU page!\n"); cache_unlock_continue: spin_unlock(&pagecache_lock); unlock_continue: - spin_lock(&pagemap_lru_lock); + spin_lock(lru_lock); UnlockPage(page); - page_cache_release(page); - goto dispose_continue; -async_swap_continue: - spin_unlock(&pagecache_lock); - /* Do NOT unlock the page ... that is done after IO. */ - ClearPageDirty(page); - rw_swap_page(WRITE, page, 0); - spin_lock(&pagemap_lru_lock); - page_cache_release(page); + put_page(page); + + if (!page->map_count || page->buffers) + list_add(page_lru, dispose); + else { + this_lru->nr_cache_pages--; + this_lru->nr_map_pages++; + } + if (!PageTestandClearOutLru(page)) + BUG(); + continue; + dispose_continue: - list_add(page_lru, &lru_cache); + list_add(page_lru, dispose); } goto out; @@ -440,18 +410,44 @@ page_cache_release(page); made_buffer_progress: UnlockPage(page); - page_cache_release(page); + if (!PageTestandClearOutLru(page)) + BUG(); + put_page(page); ret = 1; - spin_lock(&pagemap_lru_lock); - /* nr_lru_pages needs the spinlock */ - nr_lru_pages--; + spin_lock(lru_lock); + /* nr_pages needs the spinlock */ + this_lru->nr_cache_pages--; out: - spin_unlock(&pagemap_lru_lock); + list_splice(&young, lru_head); + list_splice(&old, lru_head->prev); + + spin_unlock(lru_lock); + *__count = count; return ret; } +int shrink_mmap(int priority, zone_t *zone) +{ + lru_cache_t * this_lru; + unsigned long count; + int i; + + this_lru = &zone->zone_pgdat->lru_cache; + + count = this_lru->nr_cache_pages; + if (lru_cache_under_min(count)) + return 0; + + count /= priority + 1; + + for (i = 0; i < NR_LRU_CACHE; i++) + if (__shrink_mmap(priority, zone, &count, this_lru, i)) + return 1; + return 0; +} + static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) { goto inside; @@ -526,7 +522,7 @@ if (page->index < start) continue; - page_cache_get(page); + get_page(page); spin_unlock(&pagecache_lock); lock_page(page); @@ -563,18 +559,18 @@ * The caller must have locked the page and * set all the page flags correctly.. 
*/ -void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) +void add_to_swap_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) { if (!PageLocked(page)) BUG(); - page_cache_get(page); - spin_lock(&pagecache_lock); + get_page(page); page->index = index; + spin_lock(&pagecache_lock); add_page_to_inode_queue(mapping, page); __add_page_to_hash_queue(page, page_hash(mapping, index)); - lru_cache_add(page); spin_unlock(&pagecache_lock); + lru_cache_add(page, LRU_SWAP_CACHE); } /* @@ -591,25 +587,18 @@ if (PageLocked(page)) BUG(); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty)); - page->flags = flags | (1 << PG_locked) | (1 << PG_referenced); - page_cache_get(page); + flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + page->flags = flags | (1 << PG_locked); + get_page(page); page->index = offset; add_page_to_inode_queue(mapping, page); __add_page_to_hash_queue(page, hash); - lru_cache_add(page); + lru_cache_add(page, LRU_NORMAL_CACHE); alias = __find_page_nolock(mapping, offset, *hash); if (alias != page) BUG(); } -void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) -{ - spin_lock(&pagecache_lock); - __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); - spin_unlock(&pagecache_lock); -} - static int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long offset, struct page **hash) @@ -735,7 +724,7 @@ spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); if (page) - page_cache_get(page); + get_page(page); spin_unlock(&pagecache_lock); /* Found the page, sleep if locked. */ @@ -785,7 +774,7 @@ spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); if (page) - page_cache_get(page); + get_page(page); spin_unlock(&pagecache_lock); /* Found the page, sleep if locked. */ @@ -1143,7 +1132,7 @@ if (!page) goto no_cached_page; found_page: - page_cache_get(page); + get_page(page); spin_unlock(&pagecache_lock); if (!Page_Uptodate(page)) @@ -1521,6 +1510,7 @@ struct page *new_page = page_cache_alloc(); if (new_page) { + page_anon_init_map_wmb(new_page); copy_user_highpage(new_page, old_page, address); flush_page_to_ram(new_page); } else @@ -1530,6 +1520,7 @@ } flush_page_to_ram(old_page); + lru_cache_map(old_page); return old_page; no_cached_page: @@ -1646,7 +1637,8 @@ set_pte(ptep, pte_mkclean(pte)); flush_tlb_page(vma, address); page = pte_page(pte); - page_cache_get(page); + page_map(page); + get_page(page); } else { if (pte_none(pte)) return 0; @@ -1659,6 +1651,7 @@ } page = pte_page(pte); if (!pte_dirty(pte) || flags == MS_INVALIDATE) { + lru_cache_unmap(page, LRU_NORMAL_CACHE); page_cache_free(page); return 0; } @@ -1672,6 +1665,7 @@ lock_page(page); error = filemap_write_page(vma->vm_file, page, 1); UnlockPage(page); + lru_cache_unmap(page, LRU_NORMAL_CACHE); page_cache_free(page); return error; } diff -urN 2.4.0-test1-ac7/mm/highmem.c 2.4.0-test1-ac7-VM-31/mm/highmem.c --- 2.4.0-test1-ac7/mm/highmem.c Fri May 26 22:47:10 2000 +++ 2.4.0-test1-ac7-VM-31/mm/highmem.c Sat Jun 3 14:53:15 2000 @@ -29,8 +29,7 @@ */ struct page * prepare_highmem_swapout(struct page * page) { - struct page *new_page; - unsigned long regular_page; + struct page * regular_page; unsigned long vaddr; /* * If this is a highmem page so it can't be swapped out directly @@ -48,22 +47,25 @@ * across a fork(). 
*/ UnlockPage(page); - regular_page = __get_free_page(GFP_ATOMIC); + regular_page = alloc_page(GFP_ATOMIC); if (!regular_page) return NULL; vaddr = kmap(page); - copy_page((void *)regular_page, (void *)vaddr); + copy_page((void *)page_address(regular_page), (void *)vaddr); kunmap(page); /* * ok, we can just forget about our highmem page since * we stored its data into the new regular_page. */ + if (page->map_count) { + regular_page->map_count = 1; + page_unmap(page); + } page_cache_release(page); - new_page = mem_map + MAP_NR(regular_page); - LockPage(new_page); - return new_page; + LockPage(regular_page); + return regular_page; } struct page * replace_with_highmem(struct page * page) @@ -86,9 +88,8 @@ copy_page((void *)vaddr, (void *)page_address(page)); kunmap(highpage); - /* Preserve the caching of the swap_entry. */ - highpage->index = page->index; - highpage->mapping = page->mapping; + if (page->mapping) + BUG(); /* * We can just forget the old page since diff -urN 2.4.0-test1-ac7/mm/memory.c 2.4.0-test1-ac7-VM-31/mm/memory.c --- 2.4.0-test1-ac7/mm/memory.c Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/mm/memory.c Sat Jun 3 15:27:14 2000 @@ -156,6 +156,7 @@ unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + spinlock_t * pte_lock = &vma->vm_mm->page_table_lock; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -208,9 +209,16 @@ src_pte = pte_offset(src_pmd, address); dst_pte = pte_offset(dst_pmd, address); + /* + * This spin_lock wouldn't be necessary right now + * since everybody holds the kernel lock but + * it will become necessary later. + */ + spin_lock(pte_lock); do { pte_t pte = *src_pte; unsigned long page_nr; + struct page * page; /* copy_one_pte */ @@ -235,15 +243,19 @@ /* If it's a shared mapping, mark it clean in the child */ if (vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); + page = &mem_map[page_nr]; + if (page->map_count) + page_map(page); + get_page(page); set_pte(dst_pte, pte_mkold(pte)); - get_page(mem_map + page_nr); cont_copy_pte_range: address += PAGE_SIZE; if (address >= end) - goto out; + goto out_unlock; src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(pte_lock); cont_copy_pmd_range: src_pmd++; dst_pmd++; @@ -251,6 +263,9 @@ } out: return 0; +out_unlock: + spin_unlock(pte_lock); + return 0; nomem: return -ENOMEM; @@ -259,20 +274,21 @@ /* * Return indicates whether a page was freed so caller can adjust rss */ -static inline int free_pte(pte_t page) +static inline int free_pte(pte_t pte) { - if (pte_present(page)) { - unsigned long nr = pte_pagenr(page); - if (nr >= max_mapnr || PageReserved(mem_map+nr)) + if (pte_present(pte)) { + unsigned long nr = pte_pagenr(pte); + struct page * page = mem_map + nr; + if (nr >= max_mapnr || PageReserved(page)) return 0; /* * free_page() used to be able to clear swap cache * entries. We may now have to do it manually. 
*/ - free_page_and_swap_cache(mem_map+nr); + free_page_and_swap_cache(page); return 1; } - swap_free(pte_to_swp_entry(page)); + swap_free(pte_to_swp_entry(pte)); return 0; } @@ -781,8 +797,14 @@ */ static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) { - flush_tlb_page(vma, address); set_pte(page_table, entry); + /* + * Sorry but with the current linux VM design the tlb flush have + * to happen after setting the pte or threads will break in SMP + * for everybody (yes, also in the architectures that can't flush + * the tlb without the valid pte entry in place). + */ + flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -792,7 +814,7 @@ copy_cow_page(old_page,new_page,address); flush_page_to_ram(new_page); flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); + establish_pte(vma, address, page_table, pte_mkyoung(pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))))); } /* @@ -836,6 +858,7 @@ */ switch (page_count(old_page)) { case 2: + case 3: /* * Lock the page so that no one can look it up from * the swap cache, grab a reference and start using it. @@ -843,14 +866,20 @@ */ if (!PageSwapCache(old_page) || TryLockPage(old_page)) break; - if (is_page_shared(old_page)) { + if (is_swap_cache_shared(old_page)) { UnlockPage(old_page); break; } - SetPageDirty(old_page); + lru_cache_unmap(old_page, LRU_SWAP_CACHE); + delete_from_swap_cache_nolock(old_page); UnlockPage(old_page); + page_anon_init_map_wmb(old_page); /* FallThrough */ case 1: + if (PageReserved(old_page)) + break; + if (old_page->map_count != 1) + BUG(); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); spin_unlock(&mm->page_table_lock); @@ -870,8 +899,20 @@ * Re-check the pte - we dropped the lock */ if (pte_val(*page_table) == pte_val(pte)) { - if (PageReserved(old_page)) + if (!PageReserved(old_page)) { + /* SHM memory is never write protected */ + if (!old_page->map_count) + BUG(); + lru_cache_unmap(old_page, + !PageSwapCache(old_page) ? + LRU_NORMAL_CACHE : + LRU_SWAP_CACHE); + } else { + if (old_page->map_count) + BUG(); ++mm->rss; + } + page_anon_init_map_wmb(new_page); break_cow(vma, old_page, new_page, address, page_table); /* Free the old page.. */ @@ -1058,14 +1099,22 @@ */ lock_page(page); swap_free(entry); - if (write_access && !is_page_shared(page) && nr_free_highpages()) { + if (write_access && !is_swap_cache_shared(page)) { delete_from_swap_cache_nolock(page); UnlockPage(page); page = replace_with_highmem(page); + page_anon_init_map_wmb(page); pte = mk_pte(page, vma->vm_page_prot); pte = pte_mkwrite(pte_mkdirty(pte)); - } else + } else { UnlockPage(page); + /* + * No need of the page lock, we have the PG_out_lru + * that avoids us to list_del a page that isn't in the + * lru. 
+ */ + lru_cache_map(page); + } set_pte(page_table, pte); /* No need to invalidate - it was non-present before */ @@ -1078,15 +1127,13 @@ */ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { - int high = 0; struct page *page = NULL; pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { page = alloc_page(GFP_HIGHUSER); if (!page) return -1; - if (PageHighMem(page)) - high = 1; + page_anon_init_map_wmb(page); clear_user_highpage(page, addr); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); mm->rss++; diff -urN 2.4.0-test1-ac7/mm/mmap.c 2.4.0-test1-ac7-VM-31/mm/mmap.c --- 2.4.0-test1-ac7/mm/mmap.c Thu Apr 27 08:56:45 2000 +++ 2.4.0-test1-ac7-VM-31/mm/mmap.c Sat Jun 3 14:53:15 2000 @@ -56,7 +56,7 @@ * of num_physpages for safety margin. */ - long free; + unsigned long free; /* Sometimes we want to use more memory than we have. */ if (sysctl_overcommit_memory) diff -urN 2.4.0-test1-ac7/mm/numa.c 2.4.0-test1-ac7-VM-31/mm/numa.c --- 2.4.0-test1-ac7/mm/numa.c Tue Apr 18 07:11:42 2000 +++ 2.4.0-test1-ac7-VM-31/mm/numa.c Sat Jun 3 14:53:15 2000 @@ -33,7 +33,7 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) { - return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order); + return __alloc_pages(NODE_DATA(nid)->node_gfpmask_zone + gfp_mask, order); } #ifdef CONFIG_DISCONTIGMEM diff -urN 2.4.0-test1-ac7/mm/page_alloc.c 2.4.0-test1-ac7-VM-31/mm/page_alloc.c --- 2.4.0-test1-ac7/mm/page_alloc.c Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/mm/page_alloc.c Sat Jun 3 16:26:49 2000 @@ -25,11 +25,10 @@ #endif int nr_swap_pages; -int nr_lru_pages; pg_data_t *pgdat_list; static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 512, }; +static int zone_balance_ratio[MAX_NR_ZONES] = { 128, 128, 128, }; static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, }; static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, }; @@ -70,6 +69,8 @@ free_area_t *area; struct page *base; zone_t *zone; + spinlock_t * freelist_lock; + pg_data_t * pgdat; /* * Subtle. 
We do not want to test this in the inlined part of @@ -95,6 +96,10 @@ BUG(); if (PageDirty(page)) BUG(); + if (PageOutLru(page)) + BUG(); + if (page->map_count) + BUG(); zone = page->zone; @@ -107,10 +112,25 @@ area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); + pgdat = zone->zone_pgdat; + freelist_lock = &pgdat->freelist_lock; + spin_lock_irqsave(freelist_lock, flags); zone->free_pages -= mask; + /* update the classzone */ + { + int nr_zone = zone->nr_zone; + register zone_t * z = zone; + do { + z->classzone_free_pages -= mask; + if (z->zone_wake_kswapd && + z->classzone_free_pages > z->pages_high) + z->zone_wake_kswapd = 0; + z++; + } while (++nr_zone < pgdat->nr_zones); + } + while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -138,16 +158,7 @@ page_idx &= mask; } memlist_add_head(&(base + page_idx)->list, &area->free_list); - - spin_unlock_irqrestore(&zone->lock, flags); - - if (zone->free_pages >= zone->pages_low) { - zone->low_on_memory = 0; - } - - if (zone->free_pages >= zone->pages_high) { - zone->zone_wake_kswapd = 0; - } + spin_unlock_irqrestore(freelist_lock, flags); } #define MARK_USED(index, order, area) \ @@ -174,16 +185,14 @@ return page; } -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order)); -static struct page * rmqueue(zone_t *zone, unsigned long order) +static inline struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags) { free_area_t * area = zone->free_area + order; unsigned long curr_order = order; struct list_head *head, *curr; - unsigned long flags; struct page *page; + pg_data_t * pgdat; - spin_lock_irqsave(&zone->lock, flags); do { head = &area->free_list; curr = memlist_next(head); @@ -197,10 +206,21 @@ memlist_del(curr); index = (page - mem_map) - zone->offset; MARK_USED(index, curr_order, area); - zone->free_pages -= 1 << order; + + zone->free_pages -= 1UL << order; + pgdat = zone->zone_pgdat; + /* update the classzone */ + { + int nr_zone = zone->nr_zone; + register zone_t * z = zone; + do { + z->classzone_free_pages -= 1UL<nr_zones); + } page = expand(zone, page, index, order, curr_order, area); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(&pgdat->freelist_lock, flags); set_page_count(page, 1); if (BAD_RANGE(zone,page)) @@ -210,7 +230,6 @@ curr_order++; area++; } while (curr_order < MAX_ORDER); - spin_unlock_irqrestore(&zone->lock, flags); return NULL; } @@ -218,141 +237,130 @@ /* * This is the 'heart' of the zoned buddy allocator: */ -struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) +struct page * __alloc_pages(gfpmask_zone_t * gfpmask_zone, unsigned long order) { - zone_t **zone = zonelist->zones; - extern wait_queue_head_t kswapd_wait; + zone_t * classzone = gfpmask_zone->classzone; + pg_data_t * pgdat = classzone->zone_pgdat; + int freed; + spinlock_t * freelist_lock = &pgdat->freelist_lock; + long flags; + long free_pages; + unsigned long size = 1UL << order; + + spin_lock_irqsave(freelist_lock, flags); /* - * (If anyone calls gfp from interrupts nonatomically then it - * will sooner or later tripped up by a schedule().) - * - * We are falling back to lower-level zones if allocation - * in a higher zone fails. + * If this is a recursive call, we'd better + * do our best to just allocate things without + * further thought. */ - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->size) - BUG(); + if (current->flags & PF_MEMALLOC) + goto allocate_ok; - /* Are we supposed to free memory? Don't make it worse.. 
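The "update the classzone" loops above maintain the invariant that classzone_free_pages of zone i equals the free pages of zone i plus every lower zone, adjusted incrementally on each free_pages_ok() and rmqueue() under the per-node freelist_lock. A self-contained sketch of that invariant (zone_stub, free_block() and alloc_block() are invented names; the real free path also clears zone_wake_kswapd once a classzone count climbs back above pages_high):

#include <stdio.h>

#define NR_ZONES 3	/* DMA, Normal, HighMem, as in zone_names[] */

struct zone_stub {
	long free_pages;		/* pages free in this zone only */
	long classzone_free_pages;	/* this zone plus all lower zones */
};

static void free_block(struct zone_stub *zones, int nr_zones, int zone, long pages)
{
	zones[zone].free_pages += pages;
	for (int i = zone; i < nr_zones; i++)
		zones[i].classzone_free_pages += pages;
}

static void alloc_block(struct zone_stub *zones, int nr_zones, int zone, long pages)
{
	zones[zone].free_pages -= pages;
	for (int i = zone; i < nr_zones; i++)
		zones[i].classzone_free_pages -= pages;
}

int main(void)
{
	struct zone_stub zones[NR_ZONES] = { 0 };

	free_block(zones, NR_ZONES, 0, 100);	/* pages freed into DMA */
	free_block(zones, NR_ZONES, 1, 400);	/* pages freed into Normal */
	alloc_block(zones, NR_ZONES, 1, 50);	/* allocation satisfied from Normal */

	for (int i = 0; i < NR_ZONES; i++)
		printf("zone %d: free=%ld classzone_free=%ld\n",
		       i, zones[i].free_pages, zones[i].classzone_free_pages);
	/* The top zone's classzone_free_pages (450 here) is the node-wide
	 * total, which is exactly what the reworked nr_free_pages() reads. */
	return 0;
}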
*/ - if (!z->zone_wake_kswapd) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_low) { - z->zone_wake_kswapd = 1; + /* classzone based memory balancing */ + free_pages = classzone->classzone_free_pages; + if (!current->low_on_memory && + free_pages > classzone->pages_low) { + int nr_zone; + zone_t * z; + + allocate_ok: + z = classzone; + for (nr_zone = classzone->nr_zone; + nr_zone >= 0; + nr_zone--, z--) { + if (z->free_pages >= size) { + struct page *page = rmqueue(z, order, flags); + if (page) + return page; } - if (page) - return page; } - } + } else { + extern wait_queue_head_t kswapd_wait; - /* All zones are in need of kswapd. */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + if (free_pages > classzone->pages_low) { + high_mem: + if (current->low_on_memory) + current->low_on_memory = 0; + goto allocate_ok; + } - /* - * Ok, we don't have any zones that don't need some - * balancing.. See if we have any that aren't critical.. - */ - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->low_on_memory) { - struct page *page = rmqueue(z, order); - if (z->free_pages < (z->pages_min + z->pages_low) / 2) - z->low_on_memory = 1; - if (page) - return page; + if (!classzone->zone_wake_kswapd) { + classzone->zone_wake_kswapd = 1; + wake_up_interruptible(&kswapd_wait); } - } - /* - * Uhhuh. All the zones have been critical, which means that - * we'd better do some synchronous swap-out. kswapd has not - * been able to cope.. - */ - if (!(current->flags & PF_MEMALLOC)) { - int gfp_mask = zonelist->gfp_mask; - if (!try_to_free_pages(gfp_mask)) { - if (!(gfp_mask & __GFP_HIGH)) - goto fail; + /* Are we reaching the critical stage? */ + if (!current->low_on_memory) { + /* Not yet critical, so let kswapd handle it.. */ + if (free_pages > classzone->pages_min) + goto allocate_ok; + current->low_on_memory = 1; } - } - /* - * We freed something, so we're allowed to allocate anything we can! - */ - zone = zonelist->zones; - for (;;) { - struct page *page; + spin_unlock_irqrestore(freelist_lock, flags); + freed = try_to_free_pages(gfpmask_zone->gfp_mask, classzone); + spin_lock_irq(freelist_lock); - zone_t *z = *(zone++); - if (!z) - break; - page = rmqueue(z, order); - if (page) - return page; - } + if (freed || gfpmask_zone->gfp_mask & __GFP_HIGH) + goto allocate_ok; -fail: - /* Last try, zone->low_on_memory isn't reset until we hit pages_low */ - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - int gfp_mask = zonelist->gfp_mask; - if (!z) - break; - if (z->free_pages > z->pages_min) { - struct page *page = rmqueue(z, order); - if (page) - return page; - } + /* + * Re-check we're low on memory keeping the spinlock held + * before failing. Somebody may have released + * lots of memory from under us while we was trying + * to free the pages. We check against pages_high + * to be sure to succeed only if lots of memory is been + * released. + */ + free_pages = classzone->classzone_free_pages; + if (free_pages > classzone->pages_high) + goto high_mem; } - /* No luck.. 
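A simplified model of the balancing decision the rewritten __alloc_pages() above makes against the classzone watermarks. The per-zone fallback loop, the freelist_lock handling, the __GFP_HIGH override and the final recheck against pages_high after try_to_free_pages() are all omitted, and every name below (balance(), the *_stub types, the action enum) is invented for illustration:

#include <stdio.h>
#include <stdbool.h>

enum alloc_action {
	ALLOCATE,			/* fast path: take from the freelists */
	WAKE_KSWAPD_AND_ALLOCATE,	/* below pages_low but still above pages_min */
	RECLAIM_SYNCHRONOUSLY		/* below pages_min, or task already low_on_memory */
};

struct classzone_stub {
	long classzone_free_pages;
	long pages_min, pages_low, pages_high;
	bool zone_wake_kswapd;
};

struct task_stub {
	bool memalloc;		/* PF_MEMALLOC */
	bool low_on_memory;	/* the per-task flag used by this patch */
};

static enum alloc_action balance(struct task_stub *tsk, struct classzone_stub *cz)
{
	long free = cz->classzone_free_pages;

	if (tsk->memalloc)
		return ALLOCATE;		/* recursive allocation: no balancing */

	if (free > cz->pages_low) {
		tsk->low_on_memory = false;	/* plenty of memory again */
		return ALLOCATE;
	}

	if (!cz->zone_wake_kswapd)
		cz->zone_wake_kswapd = true;	/* wake_up_interruptible(&kswapd_wait) */

	if (!tsk->low_on_memory && free > cz->pages_min)
		return WAKE_KSWAPD_AND_ALLOCATE;

	tsk->low_on_memory = true;
	return RECLAIM_SYNCHRONOUSLY;		/* try_to_free_pages(), then retry */
}

int main(void)
{
	struct classzone_stub cz = { .pages_min = 10, .pages_low = 20, .pages_high = 30 };
	struct task_stub tsk = { 0 };
	long samples[] = { 100, 15, 5 };

	for (int i = 0; i < 3; i++) {
		cz.classzone_free_pages = samples[i];
		printf("free=%ld -> action=%d, kswapd woken=%d, low_on_memory=%d\n",
		       samples[i], (int)balance(&tsk, &cz),
		       (int)cz.zone_wake_kswapd, (int)tsk.low_on_memory);
	}
	return 0;
}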
*/ + spin_unlock_irqrestore(freelist_lock, flags); return NULL; } /* * Total amount of free (allocatable) RAM: */ -unsigned int nr_free_pages (void) +unsigned long nr_free_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned long sum; int i; sum = 0; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + sum += node_zones[pgdat->nr_zones-1].classzone_free_pages; + } return sum; } /* * Amount of free RAM allocatable as buffer memory: */ -unsigned int nr_free_buffer_pages (void) +unsigned long nr_free_buffer_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned long sum = 0; int i; - sum = nr_lru_pages; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + int higher_zone = pgdat->nr_zones-1; + sum += pgdat->lru_cache.nr_cache_pages; + sum += node_zones[higher_zone <= ZONE_NORMAL ? higher_zone : ZONE_NORMAL].classzone_free_pages; + } return sum; } #if CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) +unsigned long nr_free_highpages (void) { int i; - unsigned int pages = 0; + unsigned long pages = 0; for (i = 0; i < NUMNODES; i++) pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages; @@ -367,30 +375,33 @@ */ void show_free_areas_core(int nid) { - unsigned long order; + unsigned long order, flags; unsigned type; + pg_data_t * pgdat = NODE_DATA(nid); + spinlock_t * freelist_lock = &pgdat->freelist_lock; - printk("Free pages: %6dkB (%6dkB HighMem)\n", + printk("Free pages: %6lukB (%6lukB HighMem)\n", nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Free: %d, lru_cache: %d (%d %d %d) )\n", + printk("( Free: %lu, cache: %lu map: %lu (%d %d %d) )\n", nr_free_pages(), - nr_lru_pages, + NODE_DATA(nid)->lru_cache.nr_cache_pages, + NODE_DATA(nid)->lru_cache.nr_map_pages, freepages.min, freepages.low, freepages.high); + spin_lock_irqsave(freelist_lock, flags); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; - zone_t *zone = NODE_DATA(nid)->node_zones + type; - unsigned long nr, total, flags; + zone_t *zone = pgdat->node_zones + type; + unsigned long nr, total; - printk(" %s: ", zone->name); + printk("%s: ", zone->name); total = 0; if (zone->size) { - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { head = &(zone->free_area + order)->free_list; curr = head; @@ -405,10 +416,15 @@ printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); } - spin_unlock_irqrestore(&zone->lock, flags); + if (total != zone->free_pages) + printk("error %lu ", + zone->free_pages * (PAGE_SIZE>>10)); } - printk("= %lukB)\n", total * (PAGE_SIZE>>10)); + printk("= %lukB", total * (PAGE_SIZE>>10)); + printk(" class %ldkB\n", + zone->classzone_free_pages * (PAGE_SIZE>>10)); } + spin_unlock_irqrestore(freelist_lock, flags); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -423,18 +439,17 @@ /* * Builds allocation fallback zone lists. 
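build_gfpmask_zone() below replaces the old per-gfp-mask fallback array of zones with a single classzone pointer: the highest non-empty zone the gfp mask allows. A sketch of that selection (classzone_for() and the GFPSTUB_* bits are invented; the kernel's real __GFP_DMA and __GFP_HIGHMEM flags play their role):

#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

/* Stand-in gfp bits; only their presence matters for this sketch. */
#define GFPSTUB_DMA	0x01u
#define GFPSTUB_HIGHMEM	0x02u

/* Pick the classzone the way build_gfpmask_zone() does: start from the
 * highest zone the mask allows, then fall back to the highest zone that
 * actually has memory. */
static int classzone_for(unsigned int gfp_mask, const long *zone_pages)
{
	int k = ZONE_NORMAL;

	if (gfp_mask & GFPSTUB_HIGHMEM)
		k = ZONE_HIGHMEM;
	if (gfp_mask & GFPSTUB_DMA)
		k = ZONE_DMA;
	while (k > ZONE_DMA && zone_pages[k] == 0)
		k--;
	return k;
}

int main(void)
{
	long zone_pages[MAX_NR_ZONES] = { 4096, 126976, 0 };	/* machine without highmem */

	printf("normal allocation  -> classzone %d\n", classzone_for(0, zone_pages));
	printf("highmem allocation -> classzone %d\n", classzone_for(GFPSTUB_HIGHMEM, zone_pages));
	printf("DMA allocation     -> classzone %d\n", classzone_for(GFPSTUB_DMA, zone_pages));
	return 0;
}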
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static void __init build_gfpmask_zone(pg_data_t *pgdat) { int i, j, k; for (i = 0; i < NR_GFPINDEX; i++) { - zonelist_t *zonelist; + gfpmask_zone_t * gfpmask_zone; zone_t *zone; - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); + gfpmask_zone = pgdat->node_gfpmask_zone + i; - zonelist->gfp_mask = i; + gfpmask_zone->gfp_mask = i; j = 0; k = ZONE_NORMAL; if (i & __GFP_HIGHMEM) @@ -454,21 +469,37 @@ #ifndef CONFIG_HIGHMEM BUG(); #endif - zonelist->zones[j++] = zone; + gfpmask_zone->classzone = zone; + break; } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; + if (zone->size) { + gfpmask_zone->classzone = zone; + break; + } case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; + if (zone->size) { + gfpmask_zone->classzone = zone; + break; + } } - zonelist->zones[j++] = NULL; } } +static void __init lru_cache_init(pg_data_t * pgdat) +{ + int i; + lru_cache_t * this_lru = &pgdat->lru_cache; + + for (i = 0; i < NR_LRU_CACHE; i++) + INIT_LIST_HEAD(&this_lru->heads[i]); + this_lru->nr_cache_pages = 0; + this_lru->nr_map_pages = 0; + spin_lock_init(&this_lru->lock); +} + #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* @@ -485,7 +516,7 @@ unsigned long i, j; unsigned long map_size; unsigned long totalpages, offset, realtotalpages; - unsigned int cumulative = 0; + unsigned long classzonepages; pgdat->node_next = pgdat_list; pgdat_list = pgdat; @@ -517,7 +548,6 @@ freepages.min += i; freepages.low += i * 2; freepages.high += i * 3; - memlist_init(&lru_cache); /* * Some architectures (with lots of mem and discontinous memory @@ -534,6 +564,8 @@ pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; + spin_lock_init(&pgdat->freelist_lock); /* * Initially all pages are reserved - free ones are freed @@ -548,6 +580,7 @@ } offset = lmem_map - mem_map; + classzonepages = 0; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; unsigned long mask; @@ -556,19 +589,22 @@ realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; + classzonepages += realsize; printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; zone->name = zone_names[j]; - zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; + zone->nr_zone = j; zone->free_pages = 0; + zone->zone_wake_kswapd = 0; + zone->classzone_free_pages = 0; if (!size) continue; + pgdat->nr_zones = j+1; zone->offset = offset; - cumulative += size; - mask = (realsize / zone_balance_ratio[j]); + mask = (classzonepages / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; else if (mask > zone_balance_max[j]) @@ -576,8 +612,6 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->low_on_memory = 0; - zone->zone_wake_kswapd = 0; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -606,7 +640,8 @@ (unsigned int *) alloc_bootmem_node(nid, bitmap_size); } } - build_zonelists(pgdat); + build_gfpmask_zone(pgdat); + lru_cache_init(pgdat); } void __init free_area_init(unsigned long *zones_size) diff -urN 2.4.0-test1-ac7/mm/swap.c 2.4.0-test1-ac7-VM-31/mm/swap.c --- 2.4.0-test1-ac7/mm/swap.c Tue Dec 7 15:05:28 1999 +++ 2.4.0-test1-ac7-VM-31/mm/swap.c Sat Jun 3 14:53:15 2000 @@ -46,13 +46,7 @@ out, 
so that we don't try to swap TOO many pages out at once */ atomic_t nr_async_pages = ATOMIC_INIT(0); -buffer_mem_t buffer_mem = { - 2, /* minimum percent buffer */ - 10, /* borrow percent buffer */ - 60 /* maximum percent buffer */ -}; - -buffer_mem_t page_cache = { +buffer_mem_t lru_cache_mem = { 2, /* minimum percent page cache */ 15, /* borrow percent page cache */ 75 /* maximum */ diff -urN 2.4.0-test1-ac7/mm/swap_state.c 2.4.0-test1-ac7-VM-31/mm/swap_state.c --- 2.4.0-test1-ac7/mm/swap_state.c Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/mm/swap_state.c Sat Jun 3 15:24:36 2000 @@ -47,8 +47,6 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry) { - unsigned long flags; - #ifdef SWAP_CACHE_INFO swap_cache_add_total++; #endif @@ -58,9 +56,8 @@ BUG(); if (page->mapping) BUG(); - flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty)); - page->flags = flags | (1 << PG_referenced) | (1 << PG_uptodate); - add_to_page_cache_locked(page, &swapper_space, entry.val); + page->flags &= ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + add_to_swap_cache_locked(page, &swapper_space, entry.val); } static inline void remove_from_swap_cache(struct page *page) @@ -73,7 +70,6 @@ PAGE_BUG(page); PageClearSwapCache(page); - ClearPageDirty(page); remove_inode_page(page); } @@ -132,13 +128,22 @@ /* * If we are the only user, then try to free up the swap cache. */ - if (PageSwapCache(page) && !TryLockPage(page)) { - if (!is_page_shared(page)) { - delete_from_swap_cache_nolock(page); + if (!PageSwapCache(page)) { + if (page->map_count) + lru_cache_unmap(page, LRU_NORMAL_CACHE); + } else { + if (page->map_count <= 0) + BUG(); + lru_cache_unmap(page, LRU_SWAP_CACHE); + + if (!TryLockPage(page)) { + if (!is_swap_cache_shared(page)) + delete_from_swap_cache_nolock(page); + UnlockPage(page); } - UnlockPage(page); } - page_cache_release(page); + + __free_page(page); } @@ -205,7 +210,6 @@ struct page * read_swap_cache_async(swp_entry_t entry, int wait) { struct page *found_page = 0, *new_page; - unsigned long new_page_addr; /* * Make sure the swap entry is still in use. @@ -219,10 +223,9 @@ if (found_page) goto out_free_swap; - new_page_addr = __get_free_page(GFP_USER); - if (!new_page_addr) + new_page = alloc_page(GFP_USER); + if (!new_page) goto out_free_swap; /* Out of memory */ - new_page = mem_map + MAP_NR(new_page_addr); /* * Check the swap cache again, in case we stalled above. @@ -233,7 +236,8 @@ /* * Add it to the swap cache and read its contents. */ - lock_page(new_page); + if (TryLockPage(new_page)) + BUG(); add_to_swap_cache(new_page, entry); rw_swap_page(READ, new_page, wait); return new_page; diff -urN 2.4.0-test1-ac7/mm/swapfile.c 2.4.0-test1-ac7-VM-31/mm/swapfile.c --- 2.4.0-test1-ac7/mm/swapfile.c Fri May 26 22:47:10 2000 +++ 2.4.0-test1-ac7-VM-31/mm/swapfile.c Sat Jun 3 14:53:15 2000 @@ -230,6 +230,7 @@ return; set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); swap_free(entry); + lru_cache_map(page); get_page(page); ++vma->vm_mm->rss; } @@ -315,10 +316,20 @@ */ if (!mm) return; + /* + * Avoid the vmas to go away from under us + * and also avoids the task to play with + * pagetables under do_wp_page(). If the + * vmlist_modify_lock wouldn't acquire the + * mm->page_table_lock spinlock we should + * acquire it by hand. 
+ */ + vmlist_access_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); unuse_vma(vma, pgd, entry, page); } + vmlist_access_unlock(mm); return; } diff -urN 2.4.0-test1-ac7/mm/vmscan.c 2.4.0-test1-ac7-VM-31/mm/vmscan.c --- 2.4.0-test1-ac7/mm/vmscan.c Sat Jun 3 14:52:35 2000 +++ 2.4.0-test1-ac7-VM-31/mm/vmscan.c Sat Jun 3 15:23:41 2000 @@ -34,7 +34,7 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) +static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; swp_entry_t entry; @@ -48,9 +48,6 @@ if ((page-mem_map >= max_mapnr) || PageReserved(page)) goto out_failed; - if (mm->swap_cnt) - mm->swap_cnt--; - /* Don't look at this pte if it's been accessed recently. */ if (pte_young(pte)) { /* @@ -62,10 +59,6 @@ goto out_failed; } - /* Can only do this if we age all active pages. */ - if (PageActive(page) && page->age > 1) - goto out_failed; - if (TryLockPage(page)) goto out_failed; @@ -78,10 +71,9 @@ * memory, and we should just continue our scan. */ if (PageSwapCache(page)) { - if (pte_dirty(pte)) - SetPageDirty(page); entry.val = page->index; swap_duplicate(entry); + lru_cache_unmap(page, LRU_SWAP_CACHE); set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: UnlockPage(page); @@ -106,6 +98,8 @@ */ if (!pte_dirty(pte)) { flush_cache_page(vma, address); + if (page->map_count) + lru_cache_unmap(page, LRU_NORMAL_CACHE); pte_clear(page_table); goto drop_pte; } @@ -119,13 +113,6 @@ goto out_unlock; /* - * Don't do any of the expensive stuff if - * we're not really interested in this zone. - */ - if (page->zone->free_pages > page->zone->pages_high) - goto out_unlock; - - /* * Ok, it's really dirty. That means that * we should either create a new swap cache * entry for it, or we should write it back @@ -148,6 +135,8 @@ if (vma->vm_ops && (swapout = vma->vm_ops->swapout)) { int error; struct file *file = vma->vm_file; + if (page->map_count) + lru_cache_unmap(page, LRU_NORMAL_CACHE); if (file) get_file(file); pte_clear(page_table); vma->vm_mm->rss--; @@ -174,6 +163,11 @@ if (!(page = prepare_highmem_swapout(page))) goto out_swap_free; + if (page->map_count <= 0) { + printk("not mapped anonymous page, please report to andrea@suse.de: mapping %p, index %lu, flags %lx, count %d, map_count %d, flags %lx, buffers %p\n", page->mapping, page->index, page->flags, page_count(page), page->map_count, page->flags, page->buffers); + BUG(); + } + page_unmap(page); swap_duplicate(entry); /* One for the process, one for the swap cache */ @@ -187,10 +181,7 @@ vmlist_access_unlock(vma->vm_mm); /* OK, do a physical asynchronous write to swap. */ - // rw_swap_page(WRITE, page, 0); - /* Let shrink_mmap handle this swapout. 
*/ - SetPageDirty(page); - UnlockPage(page); + rw_swap_page(WRITE, page, 0); out_free_success: page_cache_release(page); @@ -218,7 +209,7 @@ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) { pte_t * pte; unsigned long pmd_end; @@ -240,18 +231,16 @@ do { int result; vma->vm_mm->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(mm, vma, address, pte, gfp_mask); + result = try_to_swap_out(vma, address, pte, gfp_mask); if (result) return result; - if (!mm->swap_cnt) - return 0; address += PAGE_SIZE; pte++; } while (address && (address < end)); return 0; } -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) { pmd_t * pmd; unsigned long pgd_end; @@ -271,18 +260,16 @@ end = pgd_end; do { - int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask); + int result = swap_out_pmd(vma, pmd, address, end, gfp_mask); if (result) return result; - if (!mm->swap_cnt) - return 0; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); return 0; } -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask) +static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask) { pgd_t *pgdir; unsigned long end; @@ -297,11 +284,9 @@ if (address >= end) BUG(); do { - int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask); + int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask); if (result) return result; - if (!mm->swap_cnt) - return 0; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -329,7 +314,7 @@ address = vma->vm_start; for (;;) { - int result = swap_out_vma(mm, vma, address, gfp_mask); + int result = swap_out_vma(vma, address, gfp_mask); if (result) return result; vma = vma->vm_next; @@ -356,6 +341,7 @@ struct task_struct * p; int counter; int __ret = 0; + int assign = 0; lock_kernel(); /* @@ -372,7 +358,7 @@ * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = (nr_threads << 2) >> (priority >> 2); + counter = nr_threads / (priority+1); if (counter < 1) counter = 1; @@ -380,7 +366,6 @@ unsigned long max_cnt = 0; struct mm_struct *best = NULL; int pid = 0; - int assign = 0; select: read_lock(&tasklist_lock); p = init_task.next_task; @@ -400,6 +385,8 @@ } } read_unlock(&tasklist_lock); + if (assign == 1) + assign = 2; if (!best) { if (!assign) { assign = 1; @@ -432,75 +419,118 @@ * now we need this so that we can do page allocations * without holding the kernel lock etc. * - * We want to try to free "count" pages, and we want to - * cluster them so that we get good swap-out behaviour. - * - * Don't try _too_ hard, though. We don't want to have bad - * latency. + * We want to try to free "count" pages, and we need to + * cluster them so that we get good swap-out behaviour. See + * the "free_memory()" macro for details. 
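swap_out() above now sizes its scan directly from the reclaim priority, counter = nr_threads / (priority+1), so as do_try_to_free_pages() (next hunk) walks its priority from 6 down to 0 the number of candidate processes per pass grows. A tiny sketch of that progression (the nr_threads value is arbitrary):

#include <stdio.h>

/* Mirrors the new counter computation in swap_out(). */
static int swap_out_counter(int nr_threads, int priority)
{
	int counter = nr_threads / (priority + 1);

	if (counter < 1)
		counter = 1;
	return counter;
}

int main(void)
{
	int nr_threads = 120;	/* example value */

	for (int priority = 6; priority >= 0; priority--)
		printf("priority %d -> scan up to %d processes\n",
		       priority, swap_out_counter(nr_threads, priority));
	return 0;
}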
*/ -#define FREE_COUNT 8 -#define SWAP_COUNT 16 -static int do_try_to_free_pages(unsigned int gfp_mask) +static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone) { int priority; - int count = FREE_COUNT; - int swap_count; + int count = SWAP_CLUSTER_MAX; /* Always trim SLAB caches when memory gets low. */ kmem_cache_reap(gfp_mask); - priority = 64; + priority = 6; do { - while (shrink_mmap(priority, gfp_mask)) { + while (shrink_mmap(priority, zone)) { if (!--count) goto done; } + /* + * don't be too light against the d/i cache since + * shrink_mmap() almost never fail when there's + * really plenty of memory free. + */ + count -= shrink_dcache_memory(priority, gfp_mask, zone); + count -= shrink_icache_memory(priority, gfp_mask, zone); + if (count <= 0) + goto done; + /* Try to get rid of some shared memory pages.. */ - if (gfp_mask & __GFP_IO) { - /* - * don't be too light against the d/i cache since - * shrink_mmap() almost never fail when there's - * really plenty of memory free. - */ - count -= shrink_dcache_memory(priority, gfp_mask); - count -= shrink_icache_memory(priority, gfp_mask); - if (count <= 0) + while (shm_swap(priority, gfp_mask, zone)) { + if (!--count) goto done; - while (shm_swap(priority, gfp_mask)) { - if (!--count) - goto done; - } } - /* - * Then, try to page stuff out.. - * - * This will not actually free any pages (they get - * put in the swap cache), so we must not count this - * as a "count" success. - */ - swap_count = SWAP_COUNT; - while (swap_out(priority, gfp_mask)) - if (--swap_count < 0) - break; - + /* Then, try to page stuff out.. */ + while (swap_out(priority, gfp_mask)) { + if (!--count) + goto done; + } } while (--priority >= 0); +done: - /* Always end on a shrink_mmap.. */ - while (shrink_mmap(0, gfp_mask)) { - if (!--count) - goto done; + return priority >= 0; +} + +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); + +static int kswapd_work_pgdat(pg_data_t * pgdat) +{ + int worked = 0, i; + zone_t * zone; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (current->need_resched) + schedule(); + if (!zone->zone_wake_kswapd) + continue; + if (!do_try_to_free_pages(GFP_KSWAPD, zone)) { + zone->zone_wake_kswapd = 0; + continue; + } + worked = 1; + } + + return worked; +} + +static void kswapd_work(void) +{ + int worked; + pg_data_t * pgdat; + + do { + worked = 0; + pgdat = pgdat_list; + do + worked |= kswapd_work_pgdat(pgdat); + while ((pgdat = pgdat->node_next)); + } while (worked); +} + +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->zone_wake_kswapd) + continue; + return 0; } - /* We return 1 if we are freed some page */ - return (count != FREE_COUNT); -done: return 1; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; + + pgdat = pgdat_list; + do { + if (kswapd_can_sleep_pgdat(pgdat)) + continue; + return 0; + } while ((pgdat = pgdat->node_next)); + + return 1; +} /* * The background pageout daemon, started as a kernel thread @@ -518,11 +548,13 @@ int kswapd(void *unused) { struct task_struct *tsk = current; + wait_queue_t wait; tsk->session = 1; tsk->pgrp = 1; strcpy(tsk->comm, "kswapd"); sigfillset(&tsk->blocked); + init_waitqueue_entry(&wait, tsk); /* * Tell the memory management that we're a "memory allocator", @@ -538,30 +570,23 @@ */ tsk->flags |= PF_MEMALLOC; - for (;;) { - pg_data_t *pgdat; - int something_to_do = 0; + while (1) { + /* + 
* If we actually get into a low-memory situation,
+		 * the processes needing more memory will wake us
+		 * up on a more timely basis.
+		 */
+		kswapd_work();
+		run_task_queue(&tq_disk);
-		pgdat = pgdat_list;
-		do {
-			int i;
-			for(i = 0; i < MAX_NR_ZONES; i++) {
-				zone_t *zone = pgdat->node_zones+ i;
-				if (tsk->need_resched)
-					schedule();
-				if (!zone->size || !zone->zone_wake_kswapd)
-					continue;
-				if (zone->free_pages < zone->pages_low)
-					something_to_do = 1;
-				do_try_to_free_pages(GFP_KSWAPD);
-			}
-			pgdat = pgdat->node_next;
-		} while (pgdat);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&kswapd_wait, &wait);
-		if (!something_to_do) {
-			tsk->state = TASK_INTERRUPTIBLE;
-			interruptible_sleep_on(&kswapd_wait);
-		}
+		if (kswapd_can_sleep())
+			schedule();
+
+		__set_current_state(TASK_RUNNING);
+		remove_wait_queue(&kswapd_wait, &wait);
 	}
 }
@@ -580,13 +605,13 @@
  * can be done by just dropping cached pages without having
  * any deadlock issues.
  */
-int try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int retval = 1;
 	if (gfp_mask & __GFP_WAIT) {
 		current->flags |= PF_MEMALLOC;
-		retval = do_try_to_free_pages(gfp_mask);
+		retval = do_try_to_free_pages(gfp_mask, zone);
 		current->flags &= ~PF_MEMALLOC;
 	}
 	return retval;
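The new kswapd loop above uses the standard prepare-to-sleep pattern: mark the task TASK_INTERRUPTIBLE, add it to kswapd_wait, and only then re-check kswapd_can_sleep() before calling schedule(), so a wakeup racing with the check cannot be lost. The same idea in portable pthread form (an analogue only, not kernel code: all names are invented and the mutex/condvar pair stands in for the wait queue and the zone_wake_kswapd flags):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool need_balance = false;	/* analogue of a zone's zone_wake_kswapd */

static void *kswapd_like(void *unused)
{
	(void)unused;
	for (int pass = 0; pass < 3; pass++) {
		pthread_mutex_lock(&lock);
		/* Re-check the predicate after "queueing" ourselves (here:
		 * after taking the lock), just as kswapd re-checks
		 * kswapd_can_sleep() after add_wait_queue(): a wakeup that
		 * raced with the check is not lost. */
		while (!need_balance)
			pthread_cond_wait(&wake, &lock);
		need_balance = false;
		pthread_mutex_unlock(&lock);
		printf("balancing pass %d\n", pass);	/* kswapd_work() would run here */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, kswapd_like, NULL);
	for (int i = 0; i < 3; i++) {
		usleep(100 * 1000);		/* some allocator falls below pages_low... */
		pthread_mutex_lock(&lock);
		need_balance = true;		/* zone_wake_kswapd = 1 */
		pthread_cond_signal(&wake);	/* wake_up_interruptible(&kswapd_wait) */
		pthread_mutex_unlock(&lock);
	}
	pthread_join(t, NULL);
	return 0;
}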