Binary files 2.4.0-test7-pre5aa1/ID and 2.4.0-test7-pre5aa1-cz/ID differ
diff -urN 2.4.0-test7-pre5aa1/arch/i386/mm/init.c 2.4.0-test7-pre5aa1-cz/arch/i386/mm/init.c
--- 2.4.0-test7-pre5aa1/arch/i386/mm/init.c	Thu Aug 17 19:57:23 2000
+++ 2.4.0-test7-pre5aa1-cz/arch/i386/mm/init.c	Tue Aug 22 13:09:05 2000
@@ -606,7 +606,7 @@
 	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
 	printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
-		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+		nr_free_pages() << (PAGE_SHIFT-10),
 		max_mapnr << (PAGE_SHIFT-10),
 		codesize >> 10,
 		reservedpages << (PAGE_SHIFT-10),
diff -urN 2.4.0-test7-pre5aa1/fs/buffer.c 2.4.0-test7-pre5aa1-cz/fs/buffer.c
--- 2.4.0-test7-pre5aa1/fs/buffer.c	Tue Aug 22 01:23:50 2000
+++ 2.4.0-test7-pre5aa1-cz/fs/buffer.c	Tue Aug 22 13:14:18 2000
@@ -119,12 +119,12 @@
 			   when trying to refill buffers. */
 		int interval;	/* jiffies delay between kupdate flushes */
 		int age_buffer;	/* Time for normal buffer to age before we flush it */
-		int dummy1;	/* unused, was age_super */
+		int age_super;	/* Time for superblock to age before we flush it */
 		int dummy2;	/* unused */
 		int dummy3;	/* unused */
 	} b_un;
 	unsigned int data[N_PARAM];
-} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ, 1884, 2}};
+} bdf_prm = {{40, 500, 64, 256, 5*HZ, 30*HZ, 30*HZ, 1884, 2}};
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = { 0, 10, 5, 25, 0, 1*HZ, 1*HZ, 1, 1};
@@ -894,7 +894,7 @@
 
 static __inline__ void __mark_dirty(struct buffer_head *bh, int flag)
 {
-	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+	bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer);
 	refile_buffer(bh);
 }
 
@@ -1078,6 +1078,21 @@
 }
 
 /*
+ * After reaping some pages from the page-cache, vmscan may call
+ * this function to flush buffer-heads out of their slab cache.
+ */
+int shrink_buffer_headers(zone_t * zone)
+{
+	int nr_pages = 0;
+	/*
+	 * Must not be called before the buffer-head cache is set-up.
+	 */
+	kmem_cache_shrink(bh_cachep, zone, &nr_pages);
+
+	return nr_pages;
+}
+
+/*
  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
  * no-buffer-head deadlock. Return NULL on failure; waiting for
  * buffer heads is now handled in create_buffers().
@@ -1322,6 +1337,7 @@
 	 */
 	if (!offset) {
 		if (!try_to_free_buffers(page, 0)) {
+			BUG();
 			atomic_inc(&buffermem_pages);
 			return 0;
 		}
@@ -1330,7 +1346,14 @@
 	return 1;
 }
 
-static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
+#define create_empty_buffers(page, inode, blocksize) \
+do { \
+	if (!(page)->buffers) \
+		__create_empty_buffers(page, inode, blocksize); \
+	SetPageBufferAge(page); \
+} while(0)
+
+static void __create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
 {
 	struct buffer_head *bh, *head, *tail;
 
@@ -1351,27 +1374,13 @@
 	page_cache_get(page);
 }
 
-/*
- * We are taking a block for data and we don't want any output from any
- * buffer-cache aliases starting from return from that function and
- * until the moment when something will explicitly mark the buffer
- * dirty (hopefully that will not happen until we will free that block ;-)
- * We don't even need to mark it not-uptodate - nobody can expect
- * anything from a newly allocated buffer anyway. We used to used
- * unmap_buffer() for such invalidation, but that was wrong.
We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - */ - static void unmap_underlying_metadata(struct buffer_head * bh) { struct buffer_head *old_bh; old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); if (old_bh) { - mark_buffer_clean(old_bh); - wait_on_buffer(old_bh); - clear_bit(BH_Req, &old_bh->b_state); + unmap_buffer(old_bh); /* Here we could run brelse or bforget. We use bforget because it will try to put the buffer in the freelist. */ @@ -1392,8 +1401,7 @@ if (!PageLocked(page)) BUG(); - if (!page->buffers) - create_empty_buffers(page, inode, inode->i_sb->s_blocksize); + create_empty_buffers(page, inode, inode->i_sb->s_blocksize); head = page->buffers; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -1448,8 +1456,7 @@ char *kaddr = (char *)kmap(page); blocksize = inode->i_sb->s_blocksize; - if (!page->buffers) - create_empty_buffers(page, inode, blocksize); + create_empty_buffers(page, inode, blocksize); head = page->buffers; bbits = inode->i_sb->s_blocksize_bits; @@ -1558,8 +1565,7 @@ if (!PageLocked(page)) PAGE_BUG(page); blocksize = inode->i_sb->s_blocksize; - if (!page->buffers) - create_empty_buffers(page, inode, blocksize); + create_empty_buffers(page, inode, blocksize); head = page->buffers; blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; @@ -2136,25 +2142,53 @@ * * This all is required so that we can free up memory * later. - * - * Wait: - * 0 - no wait (this does not get called - see try_to_free_buffers below) - * 1 - start IO for dirty buffers - * 2 - wait for completion of locked buffers */ -static void sync_page_buffers(struct buffer_head *bh, int wait) +static int sync_page_buffers(struct buffer_head *bh) { struct buffer_head * tmp = bh; + int ret, i; +#if BITS_PER_LONG < (MAX_BUF_PER_PAGE+1) +#error wait_IO is too short, convert to it to array for your architecture in this define +#else + unsigned long wait_IO = 0, clean = 0; +#endif + i = 0; do { struct buffer_head *p = tmp; tmp = tmp->b_this_page; - if (buffer_locked(p)) { - if (wait > 1) - __wait_on_buffer(p); - } else if (buffer_dirty(p)) + + if (buffer_dirty(p)) ll_rw_block(WRITE, 1, &p); + + if (buffer_locked(p)) { + if (test_and_set_bit(BH_Wait_IO, &p->b_state)) { + if (buffer_locked(p)) + wait_IO |= 1UL << i; + else { + clear_bit(BH_Wait_IO, &p->b_state); + clean |= 1UL << i; + } + } + } else + clean |= 1UL << i; + + i++; } while (tmp != bh); + + ret = (clean | wait_IO) == ((1UL << i) - 1); + + while (wait_IO) { + struct buffer_head *p = tmp; + tmp = tmp->b_this_page; + if (wait_IO & 1) + wait_on_buffer(p); + if (tmp == bh) + break; + wait_IO >>= 1; + } + + return ret; } /* @@ -2174,11 +2208,13 @@ * obtain a reference to a buffer head within a page. So we must * lock out all of these paths to cleanly toss the page. 
*/ -int try_to_free_buffers(struct page * page, int wait) +int try_to_free_buffers(struct page * page, int gfp_mask) { struct buffer_head * tmp, * bh = page->buffers; int index = BUFSIZE_INDEX(bh->b_size); + int pass = 0; + again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); spin_lock(&free_list[index].lock); @@ -2223,9 +2259,11 @@ /* Uhhuh, start writeback so that we don't end up with all dirty pages */ spin_unlock(&free_list[index].lock); write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); - if (wait) - sync_page_buffers(bh, wait); + spin_unlock(&lru_list_lock); + if ((gfp_mask & __GFP_IO) && !pass && sync_page_buffers(bh)) { + pass = 1; + goto again; + } return 0; } diff -urN 2.4.0-test7-pre5aa1/fs/dcache.c 2.4.0-test7-pre5aa1-cz/fs/dcache.c --- 2.4.0-test7-pre5aa1/fs/dcache.c Tue Aug 22 01:23:50 2000 +++ 2.4.0-test7-pre5aa1-cz/fs/dcache.c Tue Aug 22 13:16:28 2000 @@ -339,9 +339,17 @@ if (tmp == &dentry_unused) break; - dentry_stat.nr_unused--; - list_del_init(tmp); dentry = list_entry(tmp, struct dentry, d_lru); + list_del(tmp); + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + list_add(tmp, &dentry_unused); + continue; + } + + dentry_stat.nr_unused--; + INIT_LIST_HEAD(tmp); /* Unused dentry with a count? */ if (atomic_read(&dentry->d_count)) @@ -495,6 +503,7 @@ if (!atomic_read(&dentry->d_count)) { list_del(&dentry->d_lru); list_add(&dentry->d_lru, dentry_unused.prev); + dentry->d_flags &= ~DCACHE_REFERENCED; found++; } /* @@ -551,20 +560,17 @@ * ... * 6 - base-level: try to shrink a bit. */ -int shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int priority, zone_t * zone) { - int count = 0; + int count = 0, nr_pages = 0; + if (priority) count = dentry_stat.nr_unused / priority; prune_dcache(count); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. 
*/ - kmem_cache_shrink(dentry_cache); - return 0; + kmem_cache_shrink(dentry_cache, zone, &nr_pages); + + return nr_pages; } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) @@ -723,6 +729,7 @@ continue; } __dget_locked(dentry); + dentry->d_flags |= DCACHE_REFERENCED; spin_unlock(&dcache_lock); return dentry; } diff -urN 2.4.0-test7-pre5aa1/fs/inode.c 2.4.0-test7-pre5aa1-cz/fs/inode.c --- 2.4.0-test7-pre5aa1/fs/inode.c Tue Aug 22 01:23:51 2000 +++ 2.4.0-test7-pre5aa1-cz/fs/inode.c Tue Aug 22 13:09:05 2000 @@ -417,8 +417,9 @@ void prune_icache(int goal) { - LIST_HEAD(list); - struct list_head *entry, *freeable = &list; + LIST_HEAD(freeable); + LIST_HEAD(unfreeable); + struct list_head *entry; int count = 0; struct inode * inode; @@ -426,49 +427,53 @@ /* go simple and safe syncing everything before starting */ sync_all_inodes(); - entry = inode_unused.prev; - while (entry != &inode_unused) + while ((entry = inode_unused.prev) != &inode_unused) { - struct list_head *tmp = entry; + list_del(entry); + inode = INODE(entry); - entry = entry->prev; - inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR)) BUG(); - if (!CAN_UNUSE(inode)) - continue; if (atomic_read(&inode->i_count)) BUG(); - list_del(tmp); + + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + list_add(entry, &inode_unused); + continue; + } + + if (!CAN_UNUSE(inode)) { + list_add(entry, &unfreeable); + continue; + } + list_del(&inode->i_hash); INIT_LIST_HEAD(&inode->i_hash); - list_add(tmp, freeable); + list_add(entry, &freeable); inode->i_state |= I_FREEING; count++; if (!--goal) break; } inodes_stat.nr_unused -= count; + list_splice(&unfreeable, &inode_unused); spin_unlock(&inode_lock); - dispose_list(freeable); + dispose_list(&freeable); } -int shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int priority, zone_t * zone) { - int count = 0; + int count = 0, nr_pages = 0; if (priority) count = inodes_stat.nr_unused / priority; prune_icache(count); - /* FIXME: kmem_cache_shrink here should tell us - the number of pages freed, and it should - work in a __GFP_DMA/__GFP_HIGHMEM behaviour - to free only the interesting pages in - function of the needs of the current allocation. 
*/ - kmem_cache_shrink(inode_cachep); - return 0; + kmem_cache_shrink(inode_cachep, zone, &nr_pages); + + return nr_pages; } /* @@ -495,6 +500,7 @@ continue; if (find_actor && !find_actor(inode, ino, opaque)) continue; + inode->i_state |= I_REFERENCED; break; } return inode; diff -urN 2.4.0-test7-pre5aa1/include/linux/cache.h 2.4.0-test7-pre5aa1-cz/include/linux/cache.h --- 2.4.0-test7-pre5aa1/include/linux/cache.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/cache.h Tue Aug 22 14:56:35 2000 @@ -1,6 +1,7 @@ #ifndef __LINUX_CACHE_H #define __LINUX_CACHE_H +#include #include #ifndef L1_CACHE_ALIGN @@ -13,6 +14,14 @@ #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) +#endif + +#ifndef ____cacheline_aligned_in_smp +#ifdef CONFIG_SMP +#define ____cacheline_aligned_in_smp ____cacheline_aligned +#else +#define ____cacheline_aligned_in_smp +#endif /* CONFIG_SMP */ #endif #ifndef __cacheline_aligned diff -urN 2.4.0-test7-pre5aa1/include/linux/dcache.h 2.4.0-test7-pre5aa1-cz/include/linux/dcache.h --- 2.4.0-test7-pre5aa1/include/linux/dcache.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/dcache.h Tue Aug 22 14:56:35 2000 @@ -115,6 +115,7 @@ * If this dentry points to a directory, then * s_nfsd_free_path semaphore will be down */ +#define DCACHE_REFERENCED 0x0008 extern spinlock_t dcache_lock; @@ -163,11 +164,11 @@ #define shrink_dcache() prune_dcache(0) struct zone_struct; /* dcache memory management */ -extern int shrink_dcache_memory(int, unsigned int); +extern int shrink_dcache_memory(int, struct zone_struct *); extern void prune_dcache(int); /* icache memory management (defined in linux/fs/inode.c) */ -extern int shrink_icache_memory(int, int); +extern int shrink_icache_memory(int, struct zone_struct *); extern void prune_icache(int); /* only used at mount-time */ diff -urN 2.4.0-test7-pre5aa1/include/linux/fs.h 2.4.0-test7-pre5aa1-cz/include/linux/fs.h --- 2.4.0-test7-pre5aa1/include/linux/fs.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/fs.h Tue Aug 22 14:56:35 2000 @@ -203,6 +203,7 @@ #define BH_Mapped 4 /* 1 if the buffer has a disk mapping */ #define BH_New 5 /* 1 if the buffer is new and not yet written out */ #define BH_Protected 6 /* 1 if the buffer is protected */ +#define BH_Wait_IO 7 /* 1 if the buffer is under I/O for too long */ /* * Try to keep the most commonly used fields in single cache lines (16 @@ -245,6 +246,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); void init_buffer(struct buffer_head *, bh_end_io_t *, void *); +unsigned int shrink_buffer_heads(unsigned int); #define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) @@ -450,6 +452,7 @@ #define I_LOCK 2 #define I_FREEING 4 #define I_CLEAR 8 +#define I_REFERENCED 16 extern void __mark_inode_dirty(struct inode *); static inline void mark_inode_dirty(struct inode *inode) @@ -938,7 +941,9 @@ extern int fs_may_remount_ro(struct super_block *); -extern int try_to_free_buffers(struct page *, int); +struct zone_struct; +extern int shrink_buffer_headers(struct zone_struct *); +extern int FASTCALL(try_to_free_buffers(struct page *, int)); extern void refile_buffer(struct buffer_head * buf); #define BUF_CLEAN 0 diff -urN 2.4.0-test7-pre5aa1/include/linux/highmem.h 2.4.0-test7-pre5aa1-cz/include/linux/highmem.h --- 2.4.0-test7-pre5aa1/include/linux/highmem.h Tue Aug 22 01:30:04 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/highmem.h Tue Aug 22 14:56:39 2000 @@ -11,7 +11,7 
@@ #include /* declarations for linux/mm/highmem.c */ -FASTCALL(unsigned int nr_free_highpages(void)); +extern unsigned int nr_free_highpages(void); extern struct page * prepare_highmem_swapout(struct page *); extern struct page * replace_with_highmem(struct page *); @@ -19,7 +19,7 @@ #else /* CONFIG_HIGHMEM */ -extern inline unsigned int nr_free_highpages(void) { return 0; } +#define nr_free_highpages() 0UL #define prepare_highmem_swapout(page) page #define replace_with_highmem(page) page diff -urN 2.4.0-test7-pre5aa1/include/linux/locks.h 2.4.0-test7-pre5aa1-cz/include/linux/locks.h --- 2.4.0-test7-pre5aa1/include/linux/locks.h Tue Aug 22 01:30:04 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/locks.h Tue Aug 22 14:56:39 2000 @@ -29,6 +29,7 @@ extern inline void unlock_buffer(struct buffer_head *bh) { clear_bit(BH_Lock, &bh->b_state); + clear_bit(BH_Wait_IO, &bh->b_state); wake_up(&bh->b_wait); } diff -urN 2.4.0-test7-pre5aa1/include/linux/mm.h 2.4.0-test7-pre5aa1-cz/include/linux/mm.h --- 2.4.0-test7-pre5aa1/include/linux/mm.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/mm.h Tue Aug 22 14:56:35 2000 @@ -168,7 +168,7 @@ #define PG_uptodate 3 #define PG_dirty 4 #define PG_decr_after 5 -#define PG_unused_01 6 +#define PG_buffer_age 6 #define PG__unused_02 7 #define PG_slab 8 #define PG_swap_cache 9 @@ -225,6 +225,10 @@ #define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) #define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) +#define ClearPageBufferAge(page) clear_bit(PG_buffer_age, &(page)->flags) +#define SetPageBufferAge(page) set_bit(PG_buffer_age, &(page)->flags) +#define TestandClearPageBufferAge(page) test_and_clear_bit(PG_buffer_age, &(page)->flags) + /* * Error return values for the *_nopage functions */ @@ -310,7 +314,7 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ -extern struct page * FASTCALL(__alloc_pages(zonelist_t *zonelist, unsigned long order)); +extern struct page * FASTCALL(__alloc_pages(gfpmask_zone_t *, unsigned long order)); extern struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order); #ifndef CONFIG_DISCONTIGMEM @@ -321,7 +325,7 @@ */ if (order >= MAX_ORDER) return NULL; - return __alloc_pages(contig_page_data.node_zonelists+(gfp_mask), order); + return __alloc_pages(contig_page_data.node_gfpmask_zone+gfp_mask, order); } #else /* !CONFIG_DISCONTIGMEM */ extern struct page * alloc_pages(int gfp_mask, unsigned long order); @@ -415,7 +419,7 @@ /* filemap.c */ extern void remove_inode_page(struct page *); extern unsigned long page_unuse(struct page *); -extern int shrink_mmap(int, int); +extern int FASTCALL(shrink_mmap(int, int, zone_t *)); extern void truncate_inode_pages(struct address_space *, loff_t); /* generic vm_area_ops exported for stackable file systems */ diff -urN 2.4.0-test7-pre5aa1/include/linux/mmzone.h 2.4.0-test7-pre5aa1-cz/include/linux/mmzone.h --- 2.4.0-test7-pre5aa1/include/linux/mmzone.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/mmzone.h Tue Aug 22 14:56:35 2000 @@ -21,16 +21,26 @@ struct pglist_data; +/* + * Memory balancing internally to the node can work correctly only on + * classzone basis while handling overlapped classzones. 
+ */ typedef struct zone_struct { /* * Commonly accessed fields: */ - spinlock_t lock; unsigned long offset; unsigned long free_pages; - char low_on_memory; - char zone_wake_kswapd; + + /* + * Memory balancing is all classzone based, all the below + * fields refer to the classzone. The classzone includes + * the current zone plus all the lower zones in the MM. + */ + unsigned long classzone_free_pages; unsigned long pages_min, pages_low, pages_high; + char zone_wake_kswapd; + int nr_zone; /* * free areas of different sizes @@ -57,27 +67,34 @@ #define MAX_NR_ZONES 3 /* - * One allocation request operates on a zonelist. A zonelist - * is a list of zones, the first one is the 'goal' of the - * allocation, the other zones are fallback zones, in decreasing - * priority. - * - * Right now a zonelist takes up less than a cacheline. We never - * modify it apart from boot-up, and only a few indices are used, - * so despite the zonelist table being relatively big, the cache - * footprint of this construct is very small. + * The pgdat->node_gfpmask_zone[] array tell us which classzone + * we should allocate from given a certain gfpmask. It translates + * the gfpmask to a classzone. */ -typedef struct zonelist_struct { - zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited +typedef struct gfpmask_zone_s { + zone_t * classzone; int gfp_mask; -} zonelist_t; +} gfpmask_zone_t; #define NR_GFPINDEX 0x100 +#define NR_VM_LRU 1 +#define LRU_CACHE 0 + +typedef struct vm_lru_s { + /* keep lock in a separate cacheline to avoid ping pong in SMP */ + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head head; + unsigned long nr_pages; +} vm_lru_t; + struct bootmem_data; typedef struct pglist_data { + spinlock_t freelist_lock ____cacheline_aligned_in_smp; zone_t node_zones[MAX_NR_ZONES]; - zonelist_t node_zonelists[NR_GFPINDEX]; + gfpmask_zone_t node_gfpmask_zone[NR_GFPINDEX]; + int nr_zones; + vm_lru_t vm_lru[NR_VM_LRU]; struct page *node_mem_map; unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; @@ -92,8 +109,7 @@ extern pg_data_t *pgdat_list; #define memclass(pgzone, tzone) (((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \ - && (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \ - ((tzone) - (pgzone)->zone_pgdat->node_zones))) + && ((pgzone) <= (tzone))) /* * The following two are not meant for general usage. 
They are here as diff -urN 2.4.0-test7-pre5aa1/include/linux/sched.h 2.4.0-test7-pre5aa1-cz/include/linux/sched.h --- 2.4.0-test7-pre5aa1/include/linux/sched.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/sched.h Tue Aug 22 14:56:35 2000 @@ -373,6 +373,8 @@ u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* Local freelist */ + struct list_head local_pages; int allocation_order, nr_local_pages; }; /* @@ -388,6 +390,7 @@ #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_VFORK 0x00001000 /* Wake up parent in mm_release */ +#define PF_FREE_PAGES 0x00002000 /* In the middle of memory balancing */ #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ @@ -452,7 +455,11 @@ blocked: {{0}}, \ sigqueue: NULL, \ sigqueue_tail: &tsk.sigqueue, \ - alloc_lock: SPIN_LOCK_UNLOCKED \ + alloc_lock: SPIN_LOCK_UNLOCKED, \ + local_pages: { \ + next: &init_task.local_pages, \ + prev: &init_task.local_pages, \ + }, \ } diff -urN 2.4.0-test7-pre5aa1/include/linux/slab.h 2.4.0-test7-pre5aa1-cz/include/linux/slab.h --- 2.4.0-test7-pre5aa1/include/linux/slab.h Tue Aug 22 01:30:01 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/slab.h Tue Aug 22 14:56:35 2000 @@ -52,14 +52,14 @@ void (*)(void *, kmem_cache_t *, unsigned long), void (*)(void *, kmem_cache_t *, unsigned long)); extern int kmem_cache_destroy(kmem_cache_t *); -extern int kmem_cache_shrink(kmem_cache_t *); +extern int kmem_cache_shrink(kmem_cache_t *, zone_t *, int *); extern void *kmem_cache_alloc(kmem_cache_t *, int); extern void kmem_cache_free(kmem_cache_t *, void *); extern void *kmalloc(size_t, int); extern void kfree(const void *); -extern void kmem_cache_reap(int); +extern int kmem_cache_reap(int, zone_t *); extern int slabinfo_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data); extern int slabinfo_write_proc(struct file *file, const char *buffer, diff -urN 2.4.0-test7-pre5aa1/include/linux/swap.h 2.4.0-test7-pre5aa1-cz/include/linux/swap.h --- 2.4.0-test7-pre5aa1/include/linux/swap.h Tue Aug 22 01:30:04 2000 +++ 2.4.0-test7-pre5aa1-cz/include/linux/swap.h Tue Aug 22 14:56:39 2000 @@ -64,10 +64,8 @@ }; extern int nr_swap_pages; -FASTCALL(unsigned int nr_free_pages(void)); -FASTCALL(unsigned int nr_free_buffer_pages(void)); -FASTCALL(unsigned int nr_free_highpages(void)); -extern int nr_lru_pages; +extern unsigned long nr_free_pages(void); +extern unsigned long nr_free_buffer_pages(void); extern atomic_t nr_async_pages; extern struct address_space swapper_space; extern atomic_t page_cache_size; @@ -80,13 +78,13 @@ struct zone_t; /* linux/ipc/shm.c */ -extern int shm_swap(int, int); +extern int shm_swap(int, zone_t *); /* linux/mm/swap.c */ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(unsigned int gfp_mask); +extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone); /* linux/mm/page_io.c */ extern void rw_swap_page(int, struct page *, int); @@ -163,27 +161,38 @@ /* * Helper macros for lru_pages handling. 
*/ -#define lru_cache_add(page) \ -do { \ - spin_lock(&pagemap_lru_lock); \ - list_add(&(page)->lru, &lru_cache); \ - nr_lru_pages++; \ - spin_unlock(&pagemap_lru_lock); \ -} while (0) +#define __vm_lru_add(page, vm_lru) \ +do { \ + list_add(&(page)->lru, &(vm_lru)->head); \ + (vm_lru)->nr_pages++; \ +} while(0) -#define __lru_cache_del(page) \ +#define __vm_lru_del(page, vm_lru) \ do { \ list_del(&(page)->lru); \ - nr_lru_pages--; \ + (vm_lru)->nr_pages--; \ +} while(0) + +#define lru_cache_add(page) \ +do { \ + vm_lru_t * vm_lru = (page)->zone->zone_pgdat->vm_lru; \ + vm_lru_t * lru_cache = &vm_lru[LRU_CACHE]; \ + \ + spin_lock(&lru_cache->lock); \ + __vm_lru_add(page, lru_cache); \ + spin_unlock(&lru_cache->lock); \ } while (0) -#define lru_cache_del(page) \ -do { \ - if (!PageLocked(page)) \ - BUG(); \ - spin_lock(&pagemap_lru_lock); \ - __lru_cache_del(page); \ - spin_unlock(&pagemap_lru_lock); \ +#define lru_cache_del(page) \ +do { \ + vm_lru_t * vm_lru = (page)->zone->zone_pgdat->vm_lru; \ + vm_lru_t * lru_cache = &vm_lru[LRU_CACHE]; \ + \ + if (!PageLocked(page)) \ + BUG(); \ + spin_lock(&lru_cache->lock); \ + __vm_lru_del(page, lru_cache); \ + spin_unlock(&lru_cache->lock); \ } while (0) extern spinlock_t swaplock; diff -urN 2.4.0-test7-pre5aa1/ipc/shm.c 2.4.0-test7-pre5aa1-cz/ipc/shm.c --- 2.4.0-test7-pre5aa1/ipc/shm.c Tue Aug 22 01:23:54 2000 +++ 2.4.0-test7-pre5aa1-cz/ipc/shm.c Tue Aug 22 13:09:05 2000 @@ -129,7 +129,7 @@ static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data); #endif -static void zshm_swap (int prio, int gfp_mask); +static void zshm_swap (int prio, zone_t *zone); static void zmap_unuse(swp_entry_t entry, struct page *page); static void shmzero_open(struct vm_area_struct *shmd); static void shmzero_close(struct vm_area_struct *shmd); @@ -1465,7 +1465,7 @@ #define RETRY 1 #define FAILED 2 -static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage) +static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, zone_t *zone, int *counter, struct page **outpage) { pte_t page; struct page *page_map; @@ -1474,7 +1474,7 @@ if (!pte_present(page)) return RETRY; page_map = pte_page(page); - if (page_map->zone->free_pages > page_map->zone->pages_high) + if (!memclass(page_map->zone, zone)) return RETRY; if (shp->id != zero_id) swap_attempts++; @@ -1527,22 +1527,23 @@ static unsigned long swap_id; /* currently being swapped */ static unsigned long swap_idx; /* next to swap */ -int shm_swap (int prio, int gfp_mask) +int shm_swap (int prio, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; unsigned long id, idx; - int loop = 0; + int loop; int counter; struct page * page_map; - zshm_swap(prio, gfp_mask); - counter = shm_rss / (prio + 1); + zshm_swap(prio, zone); + counter = shm_rss / prio; if (!counter) return 0; if (shm_swap_preop(&swap_entry)) return 0; + loop = 0; shm_lockall(); check_id: shp = shm_get(swap_id); @@ -1568,7 +1569,7 @@ if (idx >= shp->shm_npages) goto next_id; - switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) { + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { case RETRY: goto check_table; case FAILED: goto failed; } @@ -1854,7 +1855,7 @@ spin_unlock(&zmap_list_lock); } -static void zshm_swap (int prio, int gfp_mask) +static void zshm_swap (int prio, zone_t *zone) { struct shmid_kernel *shp; swp_entry_t swap_entry; @@ -1863,7 +1864,7 @@ int 
counter; struct page * page_map; - counter = zshm_rss / (prio + 1); + counter = zshm_rss / prio; if (!counter) return; next: @@ -1899,7 +1900,7 @@ goto next_id; } - switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) { + switch (shm_swap_core(shp, idx, swap_entry, zone, &counter, &page_map)) { case RETRY: goto check_table; case FAILED: goto failed; } diff -urN 2.4.0-test7-pre5aa1/ipc/util.c 2.4.0-test7-pre5aa1-cz/ipc/util.c --- 2.4.0-test7-pre5aa1/ipc/util.c Sat Jun 24 16:03:03 2000 +++ 2.4.0-test7-pre5aa1-cz/ipc/util.c Tue Aug 22 13:09:05 2000 @@ -345,7 +345,7 @@ return; } -int shm_swap (int prio, int gfp_mask) +int shm_swap (int prio, int gfp_mask, zone_t *zone) { return 0; } diff -urN 2.4.0-test7-pre5aa1/kernel/fork.c 2.4.0-test7-pre5aa1-cz/kernel/fork.c --- 2.4.0-test7-pre5aa1/kernel/fork.c Tue Aug 22 01:23:54 2000 +++ 2.4.0-test7-pre5aa1-cz/kernel/fork.c Tue Aug 22 13:09:05 2000 @@ -620,6 +620,8 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + INIT_LIST_HEAD(&p->local_pages); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) diff -urN 2.4.0-test7-pre5aa1/mm/filemap.c 2.4.0-test7-pre5aa1-cz/mm/filemap.c --- 2.4.0-test7-pre5aa1/mm/filemap.c Tue Aug 22 13:08:39 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/filemap.c Tue Aug 22 13:21:09 2000 @@ -44,7 +44,6 @@ atomic_t page_cache_size = ATOMIC_INIT(0); unsigned int page_hash_bits; struct page **page_hash_table; -struct list_head lru_cache; static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED; /* @@ -125,7 +124,6 @@ head = &inode->i_mapping->pages; spin_lock(&pagecache_lock); - spin_lock(&pagemap_lru_lock); curr = head->next; while (curr != head) { @@ -136,13 +134,12 @@ if (TryLockPage(page)) continue; - __lru_cache_del(page); + lru_cache_del(page); __remove_inode_page(page); UnlockPage(page); page_cache_release(page); } - spin_unlock(&pagemap_lru_lock); spin_unlock(&pagecache_lock); } @@ -245,30 +242,110 @@ spin_unlock(&pagecache_lock); } +#define VM_PAGE_OLD 0 +#define VM_PAGE_BH_OLD 1 +#define VM_PAGE_YOUNG 2 + +static inline int vm_page_age(struct page * page) +{ + int ret = VM_PAGE_OLD, referenced, buffer_age; + + /* + * We hold he lru list lock here but the page isn't locked yet. + * + * The overlapped buffer headers can go away from under us + * (and that's not a problem since there's no + * stability issue), but the mapping can't go away because + * we always run lru_cache_del() _before_ __remove_inode_pages(), + * and the real buffer cache can't go away because + * if somebody would be attempting to free the buffer cache + * from under us then the page wouldn't be in the LRU in first place + * (only shrink_mmap frees the buffer cache and unlink the page + * from the lru before starting for real). + */ + if (!page->buffers && !page->mapping) + PAGE_BUG(page); + + /* + * If the page isn't referenced it means it's + * not interesting data and we so must throw away also + * the buffer headers without caring about their age. + * + * If the page is referenced very often we could + * still want to drop the overlapped buffers, + * think at read(2)/write(2) case. BufferAge tell us + * if we should drop the buffers even if the + * page is referenced very often. + */ + + referenced = PageTestandClearReferenced(page); + if (!!page->buffers ^ !!page->mapping) { + if (referenced) + ret = VM_PAGE_YOUNG; + } else { + /* + * This is page cache with overlapped buffers. + * Always clear the buffer-age bit. 
+ * + * Note: the buffer could grow from under us, or + * the buffers could go away from under us, + * that's not a stability problem. + */ + buffer_age = TestandClearPageBufferAge(page); + + /* If the page wasn't referenced then it's old and that's all. */ + if (referenced) { + if (buffer_age) + /* The page and the overlapped bhs are both young */ + ret = VM_PAGE_YOUNG; + else + /* The page is young but the bh are old */ + ret = VM_PAGE_BH_OLD; + } + } + + return ret; +} + /* * nr_dirty represents the number of dirty pages that we will write async * before doing sync writes. We can only do sync writes if we can * wait for IO (__GFP_IO set). */ -int shrink_mmap(int priority, int gfp_mask) +int shrink_mmap(int priority, int gfp_mask, zone_t * zone) { - int ret = 0, count, nr_dirty; - struct list_head * page_lru; - struct page * page = NULL; - - count = nr_lru_pages / (priority + 1); - nr_dirty = priority; - - /* we need pagemap_lru_lock for list_del() ... subtle code below */ - spin_lock(&pagemap_lru_lock); - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + int ret = 0, count, age, zone_state = 0; + struct list_head * page_lru, * lru_head, * dispose; + vm_lru_t * vm_lru_cache; + spinlock_t * lru_lock; + struct page * page; + + vm_lru_cache = &zone->zone_pgdat->vm_lru[LRU_CACHE]; + lru_lock = &vm_lru_cache->lock; + lru_head = &vm_lru_cache->head; + + /* we need lru_lock for list_del() ... subtle code below */ + spin_lock(lru_lock); + + count = vm_lru_cache->nr_pages / priority; + + while (count > 0 && (page_lru = lru_head->prev) != lru_head && !ret) { page = list_entry(page_lru, struct page, lru); list_del(page_lru); - if (PageTestandClearReferenced(page)) + age = vm_page_age(page); + + if (age == VM_PAGE_YOUNG) + goto dispose_continue; + + if (!memclass(page->zone, zone)) { + if (!(++zone_state % priority)) + count--; goto dispose_continue; + } count--; + /* * Avoid unscalable SMP locking for pages we can * immediate tell are untouchable.. @@ -284,7 +361,7 @@ the page so nobody else may SMP race with us running a lru_cache_del() (lru_cache_del() always run with the page locked down ;). */ - spin_unlock(&pagemap_lru_lock); + spin_unlock(lru_lock); /* avoid freeing the page while it's locked */ page_cache_get(page); @@ -294,15 +371,9 @@ * of zone - it's old. */ if (page->buffers) { - int wait; - /* - * 0 - free it if can do so without IO - * 1 - start write-out of dirty buffers - * 2 - wait for locked buffers - */ - wait = (gfp_mask & __GFP_IO) ? (nr_dirty-- < 0) ? 2 : 1 : 0; - if (!try_to_free_buffers(page, wait)) + if (!try_to_free_buffers(page, gfp_mask)) goto unlock_continue; + ret += shrink_buffer_headers(zone); /* page was locked, inode can't go away under us */ if (!page->mapping) { atomic_dec(&buffermem_pages); @@ -310,6 +381,9 @@ } } + if (age == VM_PAGE_BH_OLD) + goto unlock_continue; + /* Take the pagecache_lock spinlock held to avoid other tasks to notice the page while we are looking at its page count. If it's a pagecache-page we'll free it @@ -334,13 +408,6 @@ goto made_inode_progress; } - /* - * Page is from a zone we don't care about. - * Don't drop page cache entries in vain. - */ - if (page->zone->free_pages > page->zone->pages_high) - goto cache_unlock_continue; - /* is it a page-cache page? 
*/ if (page->mapping) { if (!PageDirty(page) && !pgcache_under_min()) { @@ -356,11 +423,18 @@ cache_unlock_continue: spin_unlock(&pagecache_lock); unlock_continue: - spin_lock(&pagemap_lru_lock); + spin_lock(lru_lock); UnlockPage(page); page_cache_release(page); dispose_continue: - list_add(page_lru, &lru_cache); + list_add(page_lru, lru_head); + + if (current->need_resched) { + spin_unlock(lru_lock); + current->state = TASK_RUNNING; + schedule(); + spin_lock(lru_lock); + } } goto out; @@ -369,13 +443,13 @@ made_buffer_progress: UnlockPage(page); page_cache_release(page); - ret = 1; - spin_lock(&pagemap_lru_lock); - /* nr_lru_pages needs the spinlock */ - nr_lru_pages--; + ret += 1; + spin_lock(lru_lock); + /* nr_pages needs the spinlock */ + vm_lru_cache->nr_pages--; out: - spin_unlock(&pagemap_lru_lock); + spin_unlock(lru_lock); return ret; } @@ -414,7 +488,6 @@ if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) continue; - bh->b_flushtime = jiffies; ll_rw_block(WRITE, 1, &bh); } while ((bh = bh->b_this_page) != head); return 0; @@ -519,7 +592,7 @@ if (PageLocked(page)) BUG(); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_buffer_age)); page->flags = flags | (1 << PG_locked); page_cache_get(page); page->index = offset; diff -urN 2.4.0-test7-pre5aa1/mm/numa.c 2.4.0-test7-pre5aa1-cz/mm/numa.c --- 2.4.0-test7-pre5aa1/mm/numa.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/numa.c Tue Aug 22 13:09:05 2000 @@ -33,7 +33,7 @@ struct page * alloc_pages_node(int nid, int gfp_mask, unsigned long order) { - return __alloc_pages(NODE_DATA(nid)->node_zonelists + gfp_mask, order); + return __alloc_pages(NODE_DATA(nid)->node_gfpmask_zone + gfp_mask, order); } #ifdef CONFIG_DISCONTIGMEM diff -urN 2.4.0-test7-pre5aa1/mm/page_alloc.c 2.4.0-test7-pre5aa1-cz/mm/page_alloc.c --- 2.4.0-test7-pre5aa1/mm/page_alloc.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/page_alloc.c Tue Aug 22 15:04:58 2000 @@ -58,19 +58,64 @@ */ #define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size)) +#define __free_pages_ok_critical_section(zone, pgdat, mask, area, index, base, page_idx) \ + (zone)->free_pages -= (mask); \ + \ + /* update the classzone */ \ + { \ + int nr_zone = (zone)->nr_zone; \ + register zone_t * z = (zone); \ + do { \ + z->classzone_free_pages -= (mask); \ + if (z->zone_wake_kswapd && \ + z->classzone_free_pages > z->pages_high) \ + z->zone_wake_kswapd = 0; \ + z++; \ + } while (++nr_zone < (pgdat)->nr_zones); \ + } \ + \ + while ((mask) + (1 << (MAX_ORDER-1))) { \ + struct page *buddy1, *buddy2; \ + \ + if ((area) >= (zone)->free_area + MAX_ORDER) \ + BUG(); \ + if (!test_and_change_bit(index, (area)->map)) \ + /* \ + * the buddy page is still allocated. \ + */ \ + break; \ + /* \ + * Move the buddy up one level. \ + */ \ + buddy1 = (base) + ((page_idx) ^ -(mask)); \ + buddy2 = (base) + (page_idx); \ + if (BAD_RANGE(zone,buddy1)) \ + BUG(); \ + if (BAD_RANGE(zone,buddy2)) \ + BUG(); \ + \ + memlist_del(&buddy1->list); \ + (mask) <<= 1; \ + (area)++; \ + (index) >>= 1; \ + (page_idx) &= (mask); \ + } \ + memlist_add_head(&((base) + (page_idx))->list, &(area)->free_list); + /* * Buddy system. Hairy. 
You really aren't expected to understand this * * Hint: -mask = 1+~mask */ -static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order)); -static void __free_pages_ok (struct page *page, unsigned long order) +void __free_pages_ok (struct page *page, unsigned long order) { unsigned long index, page_idx, mask, flags; free_area_t *area; struct page *base; zone_t *zone; + spinlock_t * freelist_lock; + pg_data_t * pgdat; /* * Subtle. We do not want to test this in the inlined part of @@ -97,6 +142,10 @@ if (PageDirty(page)) BUG(); + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + zone = page->zone; mask = (~0UL) << order; @@ -108,44 +157,48 @@ area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); + pgdat = zone->zone_pgdat; + freelist_lock = &pgdat->freelist_lock; + spin_lock_irqsave(freelist_lock, flags); + __free_pages_ok_critical_section(zone, pgdat, mask, area, index, base, page_idx); + spin_unlock_irqrestore(freelist_lock, flags); + return; - zone->free_pages -= mask; + local_freelist: + /* + * This is a little subtle: if the allocation order + * wanted is major than zero we'd better take all the pages + * local since we must deal with fragmentation too and we + * can't rely on the nr_local_pages information. + */ + if (current->nr_local_pages && !current->allocation_order) + goto back_local_freelist; - while (mask + (1 << (MAX_ORDER-1))) { - struct page *buddy1, *buddy2; + list_add(&page->list, ¤t->local_pages); + page->index = order; + current->nr_local_pages++; +} - if (area >= zone->free_area + MAX_ORDER) - BUG(); - if (!test_and_change_bit(index, area->map)) - /* - * the buddy page is still allocated. - */ - break; - /* - * Move the buddy up one level. - */ - buddy1 = base + (page_idx ^ -mask); - buddy2 = base + page_idx; - if (BAD_RANGE(zone,buddy1)) - BUG(); - if (BAD_RANGE(zone,buddy2)) - BUG(); +static void free_local_pages(struct page * page) { + unsigned long index, page_idx, mask, order = page->index; + free_area_t *area; + struct page *base; + zone_t *zone; + pg_data_t * pgdat; - memlist_del(&buddy1->list); - mask <<= 1; - area++; - index >>= 1; - page_idx &= mask; - } - memlist_add_head(&(base + page_idx)->list, &area->free_list); + zone = page->zone; + mask = (~0UL) << order; + base = mem_map + zone->offset; + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); - spin_unlock_irqrestore(&zone->lock, flags); + area = zone->free_area + order; - if (zone->free_pages > zone->pages_high) { - zone->zone_wake_kswapd = 0; - zone->low_on_memory = 0; - } + pgdat = zone->zone_pgdat; + + __free_pages_ok_critical_section(zone, pgdat, mask, area, index, base, page_idx); } #define MARK_USED(index, order, area) \ @@ -172,16 +225,14 @@ return page; } -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order)); -static struct page * rmqueue(zone_t *zone, unsigned long order) +static inline struct page * rmqueue(zone_t *zone, unsigned long order, unsigned long flags) { free_area_t * area = zone->free_area + order; unsigned long curr_order = order; struct list_head *head, *curr; - unsigned long flags; struct page *page; + pg_data_t * pgdat; - spin_lock_irqsave(&zone->lock, flags); do { head = &area->free_list; curr = memlist_next(head); @@ -197,8 +248,20 @@ MARK_USED(index, curr_order, area); zone->free_pages -= 1 << order; + pgdat = zone->zone_pgdat; + /* update the classzone */ + { + int nr_zone = zone->nr_zone; + register zone_t * z = zone; + unsigned int chunk_size 
= 1<classzone_free_pages -= chunk_size; + z++; + } while (++nr_zone < pgdat->nr_zones); + } + page = expand(zone, page, index, order, curr_order, area); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(&pgdat->freelist_lock, flags); set_page_count(page, 1); if (BAD_RANGE(zone,page)) @@ -208,111 +271,110 @@ curr_order++; area++; } while (curr_order < MAX_ORDER); - spin_unlock_irqrestore(&zone->lock, flags); return NULL; } +static void refile_local_pages(void) +{ + if (current->nr_local_pages) { + struct page * page; + struct list_head * entry; + int nr_pages = current->nr_local_pages; + + while ((entry = current->local_pages.next) != ¤t->local_pages) { + list_del(entry); + page = list_entry(entry, struct page, list); + free_local_pages(page); + if (!nr_pages--) + panic("__get_free_pages local_pages list corrupted I"); + } + if (nr_pages) + panic("__get_free_pages local_pages list corrupted II"); + current->nr_local_pages = 0; + } +} + /* * This is the 'heart' of the zoned buddy allocator: */ -struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order) +struct page * __alloc_pages(gfpmask_zone_t * gfpmask_zone, unsigned long order) { - zone_t **zone; - extern wait_queue_head_t kswapd_wait; + zone_t * classzone = gfpmask_zone->classzone; + pg_data_t * pgdat = classzone->zone_pgdat; + spinlock_t * freelist_lock = &pgdat->freelist_lock; + long flags; + unsigned long size = 1UL << order; + + spin_lock_irqsave(freelist_lock, flags); /* - * (If anyone calls gfp from interrupts nonatomically then it - * will sooner or later tripped up by a schedule().) - * - * We are falling back to lower-level zones if allocation - * in a higher zone fails. + * If this is a recursive call, we'd better + * do our best to just allocate things without + * further thought. */ + if (current->flags & PF_MEMALLOC) + goto allocate_ok; - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->size) - BUG(); - - /* Are we supposed to free memory? Don't make it worse.. */ - if (!z->zone_wake_kswapd) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_low) { - z->zone_wake_kswapd = 1; - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + /* classzone based memory balancing */ + if (classzone->classzone_free_pages > classzone->pages_low) { + int nr_zone; + zone_t * z; + + allocate_ok: + z = classzone; + for (nr_zone = classzone->nr_zone; + nr_zone >= 0; + nr_zone--, z--) { + if (z->free_pages >= size) { + struct page *page = rmqueue(z, order, flags); + if (page) + return page; } - if (page) - return page; } - } + } else { + extern wait_queue_head_t kswapd_wait; - /* Three possibilities to get here - * - Previous alloc_pages resulted in last zone set to have - * zone_wake_kswapd and start it. kswapd has not been able - * to release enough pages so that one zone does not have - * zone_wake_kswapd set. - * - Different sets of zones (zonelist) - * previous did not have all zones with zone_wake_kswapd but - * this one has... should kswapd be woken up? it will run once. - * - SMP race, kswapd went to sleep slightly after it as running - * in 'if (waitqueue_active(...))' above. - * + anyway the test is very cheap to do... - */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); - - /* - * Ok, we don't have any zones that don't need some - * balancing.. See if we have any that aren't critical.. 
- */ - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - if (!z->low_on_memory) { - struct page *page = rmqueue(z, order); - if (z->free_pages < z->pages_min) - z->low_on_memory = 1; - if (page) - return page; + if (!classzone->zone_wake_kswapd) { + classzone->zone_wake_kswapd = 1; + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); } - } - /* - * Uhhuh. All the zones have been critical, which means that - * we'd better do some synchronous swap-out. kswapd has not - * been able to cope.. - */ - if (!(current->flags & PF_MEMALLOC)) { - int gfp_mask = zonelist->gfp_mask; - if (!try_to_free_pages(gfp_mask)) { - if (!(gfp_mask & __GFP_HIGH)) - goto fail; - } - } + /* Are we reaching the critical stage? */ + if (classzone->classzone_free_pages > classzone->pages_min) + /* Not yet critical, so let kswapd handle it.. */ + goto allocate_ok; + + if (gfpmask_zone->gfp_mask & __GFP_WAIT) { + int freed; + + spin_unlock_irqrestore(freelist_lock, flags); + + current->flags |= PF_MEMALLOC|PF_FREE_PAGES; + current->allocation_order = order; + freed = try_to_free_pages(gfpmask_zone->gfp_mask, classzone); + current->flags &= ~(PF_MEMALLOC|PF_FREE_PAGES); - /* - * Final phase: allocate anything we can! - */ - zone = zonelist->zones; - for (;;) { - struct page *page; - - zone_t *z = *(zone++); - if (!z) - break; - page = rmqueue(z, order); - if (page) - return page; - } + spin_lock_irq(freelist_lock); + refile_local_pages(); + + if (freed || gfpmask_zone->gfp_mask & __GFP_HIGH) + goto allocate_ok; -fail: - /* No luck.. */ + /* + * Re-check we're low on memory keeping the spinlock held + * before failing. Somebody may have released + * lots of memory from under us while we was trying + * to free the pages. We check against pages_high + * to be sure to succeed only if lots of memory is been + * released. + */ + if (classzone->classzone_free_pages > classzone->pages_high) + goto allocate_ok; + } + } + spin_unlock_irqrestore(freelist_lock, flags); return NULL; } @@ -363,40 +425,44 @@ /* * Total amount of free (allocatable) RAM: */ -unsigned int nr_free_pages (void) +unsigned long nr_free_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned long sum; int i; sum = 0; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + sum += node_zones[pgdat->nr_zones-1].classzone_free_pages; + } return sum; } /* * Amount of free RAM allocatable as buffer memory: */ -unsigned int nr_free_buffer_pages (void) +unsigned long nr_free_buffer_pages (void) { - unsigned int sum; - zone_t *zone; + unsigned long sum = 0; int i; - sum = nr_lru_pages / 3; - for (i = 0; i < NUMNODES; i++) - for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++) - sum += zone->free_pages; + for (i = 0; i < NUMNODES; i++) { + pg_data_t * pgdat = NODE_DATA(i); + zone_t * node_zones = pgdat->node_zones; + int higher_zone = pgdat->nr_zones-1; + vm_lru_t * vm_lru_cache = &pgdat->vm_lru[LRU_CACHE]; + sum += vm_lru_cache->nr_pages / 3; + sum += node_zones[higher_zone <= ZONE_NORMAL ? 
higher_zone : ZONE_NORMAL].classzone_free_pages; + } return sum; } #if CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) +unsigned long nr_free_highpages (void) { int i; - unsigned int pages = 0; + unsigned long pages = 0; for (i = 0; i < NUMNODES; i++) pages += NODE_DATA(i)->node_zones[ZONE_HIGHMEM].free_pages; @@ -411,39 +477,48 @@ */ void show_free_areas_core(int nid) { - unsigned long order; + unsigned long order, flags; unsigned type; + pg_data_t * pgdat = NODE_DATA(nid); + spinlock_t * freelist_lock = &pgdat->freelist_lock; - printk("Free pages: %6dkB (%6dkB HighMem)\n", + printk("Free pages: %6lukB (%6lukB HighMem)\n", nr_free_pages() << (PAGE_SHIFT-10), nr_free_highpages() << (PAGE_SHIFT-10)); - printk("( Free: %d, lru_cache: %d (%d %d %d) )\n", + printk("( Free: %lu, cache: %lu (%d %d %d) )\n", nr_free_pages(), - nr_lru_pages, + NODE_DATA(nid)->vm_lru[LRU_CACHE].nr_pages, freepages.min, freepages.low, freepages.high); + spin_lock_irqsave(freelist_lock, flags); for (type = 0; type < MAX_NR_ZONES; type++) { struct list_head *head, *curr; - zone_t *zone = NODE_DATA(nid)->node_zones + type; - unsigned long nr, total, flags; + zone_t *zone = pgdat->node_zones + type; + unsigned long nr, total; - printk(" %c%d%d %s: ", + printk(" %c%c%d %s: ", (zone->free_pages > zone->pages_low ? (zone->free_pages > zone->pages_high ? ' ' - : 'H') + : 'h') : (zone->free_pages > zone->pages_min + ? 'm' + : 'l')), + (zone->classzone_free_pages > zone->pages_low + ? (zone->classzone_free_pages > zone->pages_high + ? ' ' + : 'H') + : (zone->classzone_free_pages > zone->pages_min ? 'M' : 'L')), - zone->zone_wake_kswapd, zone->low_on_memory, + zone->zone_wake_kswapd, zone->name); total = 0; if (zone->size) { - spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { head = &(zone->free_area + order)->free_list; curr = head; @@ -458,10 +533,15 @@ printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order); } - spin_unlock_irqrestore(&zone->lock, flags); + if (total != zone->free_pages) + printk("error %lu ", + zone->free_pages * (PAGE_SIZE>>10)); } - printk("= %lukB)\n", total * (PAGE_SIZE>>10)); + printk("= %lukB", total * (PAGE_SIZE>>10)); + printk(" class %ldkB\n", + zone->classzone_free_pages * (PAGE_SIZE>>10)); } + spin_unlock_irqrestore(freelist_lock, flags); #ifdef SWAP_CACHE_INFO show_swap_cache_info(); @@ -476,18 +556,17 @@ /* * Builds allocation fallback zone lists. 
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static void __init build_gfpmask_zone(pg_data_t *pgdat) { int i, j, k; for (i = 0; i < NR_GFPINDEX; i++) { - zonelist_t *zonelist; + gfpmask_zone_t * gfpmask_zone; zone_t *zone; - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); + gfpmask_zone = pgdat->node_gfpmask_zone + i; - zonelist->gfp_mask = i; + gfpmask_zone->gfp_mask = i; j = 0; k = ZONE_NORMAL; if (i & __GFP_HIGHMEM) @@ -507,21 +586,37 @@ #ifndef CONFIG_HIGHMEM BUG(); #endif - zonelist->zones[j++] = zone; + gfpmask_zone->classzone = zone; + break; } case ZONE_NORMAL: zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; + if (zone->size) { + gfpmask_zone->classzone = zone; + break; + } case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; + if (zone->size) { + gfpmask_zone->classzone = zone; + break; + } } - zonelist->zones[j++] = NULL; } } +static void __init vm_lru_init(pg_data_t * pgdat) +{ + int i; + vm_lru_t * vm_lru = pgdat->vm_lru; + + for (i = 0; i < NR_VM_LRU; i++) { + spin_lock_init(&vm_lru[i].lock); + INIT_LIST_HEAD(&vm_lru[i].head); + vm_lru[i].nr_pages = 0; + } +} + #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) /* @@ -538,7 +633,7 @@ unsigned long i, j; unsigned long map_size; unsigned long totalpages, offset, realtotalpages; - unsigned int cumulative = 0; + unsigned long classzonepages; pgdat->node_next = pgdat_list; pgdat_list = pgdat; @@ -570,7 +665,6 @@ freepages.min += i; freepages.low += i * 2; freepages.high += i * 3; - memlist_init(&lru_cache); /* * Some architectures (with lots of mem and discontinous memory @@ -587,6 +681,8 @@ pgdat->node_size = totalpages; pgdat->node_start_paddr = zone_start_paddr; pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; + spin_lock_init(&pgdat->freelist_lock); /* * Initially all pages are reserved - free ones are freed @@ -601,6 +697,7 @@ } offset = lmem_map - mem_map; + classzonepages = 0; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; unsigned long mask; @@ -609,19 +706,22 @@ realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; + classzonepages += realsize; printk("zone(%lu): %lu pages.\n", j, size); zone->size = size; zone->name = zone_names[j]; - zone->lock = SPIN_LOCK_UNLOCKED; zone->zone_pgdat = pgdat; + zone->nr_zone = j; zone->free_pages = 0; + zone->zone_wake_kswapd = 0; + zone->classzone_free_pages = 0; if (!size) continue; + pgdat->nr_zones = j+1; zone->offset = offset; - cumulative += size; - mask = (realsize / zone_balance_ratio[j]); + mask = (classzonepages / zone_balance_ratio[j]); if (mask < zone_balance_min[j]) mask = zone_balance_min[j]; else if (mask > zone_balance_max[j]) @@ -629,8 +729,6 @@ zone->pages_min = mask; zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->low_on_memory = 0; - zone->zone_wake_kswapd = 0; zone->zone_mem_map = mem_map + offset; zone->zone_start_mapnr = offset; zone->zone_start_paddr = zone_start_paddr; @@ -659,7 +757,8 @@ (unsigned int *) alloc_bootmem_node(nid, bitmap_size); } } - build_zonelists(pgdat); + build_gfpmask_zone(pgdat); + vm_lru_init(pgdat); } void __init free_area_init(unsigned long *zones_size) diff -urN 2.4.0-test7-pre5aa1/mm/slab.c 2.4.0-test7-pre5aa1-cz/mm/slab.c --- 2.4.0-test7-pre5aa1/mm/slab.c Tue Aug 22 01:23:54 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/slab.c Tue Aug 22 15:26:19 2000 @@ -140,8 +140,7 @@ * * Manages the objs in a slab. 
Placed either at the beginning of mem allocated * for a slab, or allocated from an general cache. - * Slabs are chained into one ordered list: fully used, partial, then fully - * free slabs. + * Slabs are chained into three list: fully used, partial, fully free slabs. */ typedef struct slab_s { struct list_head list; @@ -167,7 +166,7 @@ } cpucache_t; #define cc_entry(cpucache) \ - ((void **)(((cpucache_t*)cpucache)+1)) + ((void **)(((cpucache_t*)(cpucache))+1)) #define cc_data(cachep) \ ((cachep)->cpudata[smp_processor_id()]) /* @@ -181,8 +180,9 @@ struct kmem_cache_s { /* 1) each alloc & free */ /* full, partial first, then free */ - struct list_head slabs; - struct list_head *firstnotfull; + struct list_head slabs_full; + struct list_head slabs_partial; + struct list_head slabs_free; unsigned int objsize; unsigned int flags; /* constant flags */ unsigned int num; /* # of objs per slab */ @@ -345,8 +345,9 @@ /* internal cache of cache description objs */ static kmem_cache_t cache_cache = { - slabs: LIST_HEAD_INIT(cache_cache.slabs), - firstnotfull: &cache_cache.slabs, + slabs_full: LIST_HEAD_INIT(cache_cache.slabs_full), + slabs_partial: LIST_HEAD_INIT(cache_cache.slabs_partial), + slabs_free: LIST_HEAD_INIT(cache_cache.slabs_free), objsize: sizeof(kmem_cache_t), flags: SLAB_NO_REAP, spinlock: SPIN_LOCK_UNLOCKED, @@ -778,8 +779,9 @@ cachep->gfpflags |= GFP_DMA; spin_lock_init(&cachep->spinlock); cachep->objsize = size; - INIT_LIST_HEAD(&cachep->slabs); - cachep->firstnotfull = &cachep->slabs; + INIT_LIST_HEAD(&cachep->slabs_full); + INIT_LIST_HEAD(&cachep->slabs_partial); + INIT_LIST_HEAD(&cachep->slabs_free); if (flags & CFLGS_OFF_SLAB) cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); @@ -884,10 +886,10 @@ #define drain_cpu_caches(cachep) do { } while (0) #endif -static int __kmem_cache_shrink(kmem_cache_t *cachep) +static int __kmem_cache_shrink(kmem_cache_t *cachep, zone_t * zone, int * nr_pages) { slab_t *slabp; - int ret; + int ret, progress; drain_cpu_caches(cachep); @@ -897,23 +899,33 @@ while (!cachep->growing) { struct list_head *p; - p = cachep->slabs.prev; - if (p == &cachep->slabs) + p = cachep->slabs_free.prev; + if (p == &cachep->slabs_free) break; - slabp = list_entry(cachep->slabs.prev, slab_t, list); + slabp = list_entry(cachep->slabs_free.prev, slab_t, list); if (slabp->inuse) - break; + BUG(); + + progress = 1; + if (zone) { + void * addr = slabp->s_mem - slabp->colouroff; + struct page * page = virt_to_page(addr); + + if (!memclass(page->zone, zone)) + progress = 0; + } list_del(&slabp->list); - if (cachep->firstnotfull == &slabp->list) - cachep->firstnotfull = &cachep->slabs; spin_unlock_irq(&cachep->spinlock); kmem_slab_destroy(cachep, slabp); + if (nr_pages && progress) + /* don't need the cache's spinlock to read the order */ + *nr_pages += 1UL << cachep->gfporder; spin_lock_irq(&cachep->spinlock); } - ret = !list_empty(&cachep->slabs); + ret = !list_empty(&cachep->slabs_full) || !list_empty(&cachep->slabs_partial); spin_unlock_irq(&cachep->spinlock); return ret; } @@ -925,12 +937,12 @@ * Releases as many slabs as possible for a cache. * To help debugging, a zero exit status indicates all slabs were released. 
*/ -int kmem_cache_shrink(kmem_cache_t *cachep) +int kmem_cache_shrink(kmem_cache_t *cachep, zone_t * zone, int * nr_pages) { if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep)) BUG(); - return __kmem_cache_shrink(cachep); + return __kmem_cache_shrink(cachep, zone, nr_pages); } /** @@ -962,7 +974,7 @@ list_del(&cachep->next); up(&cache_chain_sem); - if (__kmem_cache_shrink(cachep)) { + if (__kmem_cache_shrink(cachep, NULL, NULL)) { printk(KERN_ERR "kmem_cache_destroy: Can't free all objects %p\n", cachep); down(&cache_chain_sem); @@ -1139,9 +1151,7 @@ cachep->growing--; /* Make slab active. */ - list_add_tail(&slabp->list,&cachep->slabs); - if (cachep->firstnotfull == &cachep->slabs) - cachep->firstnotfull = &slabp->list; + list_add(&slabp->list, &cachep->slabs_free); STATS_INC_GROWN(cachep); cachep->failures = 0; @@ -1198,7 +1208,7 @@ } static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep, - slab_t *slabp) + slab_t *slabp, int partial) { void *objp; @@ -1211,9 +1221,15 @@ objp = slabp->s_mem + slabp->free*cachep->objsize; slabp->free=slab_bufctl(slabp)[slabp->free]; - if (slabp->free == BUFCTL_END) - /* slab now full: move to next slab for next alloc */ - cachep->firstnotfull = slabp->list.next; + if (slabp->free == BUFCTL_END) { + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_full); + } else { + if (!partial) { + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_partial); + } + } #if DEBUG if (cachep->flags & SLAB_POISON) if (kmem_check_poison_obj(cachep, objp)) @@ -1239,16 +1255,20 @@ */ #define kmem_cache_alloc_one(cachep) \ ({ \ - slab_t *slabp; \ + slab_t *slabp; \ + struct list_head * slab_freelist; \ + int partial = 1; \ \ - /* Get slab alloc is to come from. */ \ - { \ - struct list_head* p = cachep->firstnotfull; \ - if (p == &cachep->slabs) \ + slab_freelist = &(cachep)->slabs_partial; \ + if (list_empty(slab_freelist)) { \ + partial = 0; \ + slab_freelist = &(cachep)->slabs_free; \ + if (list_empty(slab_freelist)) \ goto alloc_new_slab; \ - slabp = list_entry(p,slab_t, list); \ } \ - kmem_cache_alloc_one_tail(cachep, slabp); \ + \ + slabp = list_entry(slab_freelist->next, slab_t, list); \ + kmem_cache_alloc_one_tail(cachep, slabp, partial); \ }) #ifdef CONFIG_SMP @@ -1256,18 +1276,25 @@ { int batchcount = cachep->batchcount; cpucache_t* cc = cc_data(cachep); + struct list_head * slab_freelist; + int partial; + slab_t *slabp; spin_lock(&cachep->spinlock); while (batchcount--) { /* Get slab alloc is to come from. */ - struct list_head *p = cachep->firstnotfull; - slab_t *slabp; + slab_freelist = &(cachep)->slabs_partial; + partial = 1; + if (list_empty(slab_freelist)) { + partial = 0; + slab_freelist = &(cachep)->slabs_free; + if (list_empty(slab_freelist)) + break; + } - if (p == &cachep->slabs) - break; - slabp = list_entry(p,slab_t, list); + slabp = list_entry(slab_freelist->next, slab_t, list); cc_entry(cc)[cc->avail++] = - kmem_cache_alloc_one_tail(cachep, slabp); + kmem_cache_alloc_one_tail(cachep, slabp, partial); } spin_unlock(&cachep->spinlock); @@ -1397,43 +1424,24 @@ } STATS_DEC_ACTIVE(cachep); - /* fixup slab chain */ - if (slabp->inuse-- == cachep->num) - goto moveslab_partial; - if (!slabp->inuse) + /* fixup slab chains */ + if (!--slabp->inuse) goto moveslab_free; + if (slabp->inuse + 1 == cachep->num) + goto moveslab_partial; return; moveslab_partial: - /* was full. 
- * Even if the page is now empty, we can set c_firstnotfull to - * slabp: there are no partial slabs in this case - */ - { - struct list_head *t = cachep->firstnotfull; + /* Was full. */ + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_partial); + return; - cachep->firstnotfull = &slabp->list; - if (slabp->list.next == t) - return; - list_del(&slabp->list); - list_add_tail(&slabp->list, t); - return; - } moveslab_free: - /* - * was partial, now empty. - * c_firstnotfull might point to slabp - * FIXME: optimize - */ - { - struct list_head *t = cachep->firstnotfull->prev; - - list_del(&slabp->list); - list_add_tail(&slabp->list, &cachep->slabs); - if (cachep->firstnotfull == &slabp->list) - cachep->firstnotfull = t->next; - return; - } + /* Was partial, now empty. */ + list_del(&slabp->list); + list_add(&slabp->list, &cachep->slabs_free); + return; } #ifdef CONFIG_SMP @@ -1744,7 +1752,7 @@ * * Called from try_to_free_page(). */ -void kmem_cache_reap (int gfp_mask) +int kmem_cache_reap (int gfp_mask, zone_t * zone) { slab_t *slabp; kmem_cache_t *searchp; @@ -1752,12 +1760,13 @@ unsigned int best_pages; unsigned int best_len; unsigned int scan; + int nr_pages = 0; if (gfp_mask & __GFP_WAIT) down(&cache_chain_sem); else if (down_trylock(&cache_chain_sem)) - return; + return 0; scan = REAP_SCANLEN; best_len = 0; @@ -1798,13 +1807,13 @@ #endif full_free = 0; - p = searchp->slabs.prev; - while (p != &searchp->slabs) { + p = searchp->slabs_free.next; + while (p != &searchp->slabs_free) { slabp = list_entry(p, slab_t, list); if (slabp->inuse) - break; + BUG(); full_free++; - p = p->prev; + p = p->next; } /* @@ -1821,7 +1830,7 @@ best_cachep = searchp; best_len = full_free; best_pages = pages; - if (full_free >= REAP_PERFECT) { + if (pages >= REAP_PERFECT) { clock_searchp = list_entry(searchp->next.next, kmem_cache_t,next); goto perfect; @@ -1841,22 +1850,29 @@ spin_lock_irq(&best_cachep->spinlock); perfect: - /* free only 80% of the free slabs */ - best_len = (best_len*4 + 1)/5; + /* free only 50% of the free slabs */ + best_len = (best_len + 1)/2; for (scan = 0; scan < best_len; scan++) { struct list_head *p; + int progress; if (best_cachep->growing) break; - p = best_cachep->slabs.prev; - if (p == &best_cachep->slabs) + p = best_cachep->slabs_free.prev; + if (p == &best_cachep->slabs_free) break; slabp = list_entry(p,slab_t,list); if (slabp->inuse) - break; + BUG(); + progress = 1; + if (zone) { + void * addr = slabp->s_mem - slabp->colouroff; + struct page * page = virt_to_page(addr); + + if (!memclass(page->zone, zone)) + progress = 0; + } list_del(&slabp->list); - if (best_cachep->firstnotfull == &slabp->list) - best_cachep->firstnotfull = &best_cachep->slabs; STATS_INC_REAPED(best_cachep); /* Safe to drop the lock. 
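The list juggling in kmem_cache_alloc_one_tail() and kmem_cache_free_one() above reduces to a single invariant: a slab's home list is a pure function of how many of its objects are in use. A tiny self-checking illustration follows; slab_home() and the enum are made-up names, not kernel symbols.

#include <assert.h>
#include <stdio.h>

/* Which of the three per-cache lists a slab belongs on. */
enum slab_list { SLAB_FULL, SLAB_PARTIAL, SLAB_FREE };

/*
 * A slab holds 'num' objects, 'inuse' of which are currently allocated.
 * The patch replaces the single ordered list plus firstnotfull pointer
 * with three lists keyed purely on this count.
 */
static enum slab_list slab_home(unsigned int inuse, unsigned int num)
{
	if (inuse == 0)
		return SLAB_FREE;
	if (inuse == num)
		return SLAB_FULL;
	return SLAB_PARTIAL;
}

int main(void)
{
	unsigned int num = 2;

	assert(slab_home(0, num) == SLAB_FREE);

	/* Allocating: free -> partial -> full. */
	assert(slab_home(1, num) == SLAB_PARTIAL);
	assert(slab_home(2, num) == SLAB_FULL);

	/* Freeing one object at a time: full -> partial -> free. */
	assert(slab_home(1, num) == SLAB_PARTIAL);
	assert(slab_home(0, num) == SLAB_FREE);

	puts("transitions match the full/partial/free invariant");
	return 0;
}

The patch itself simply performs a list_del()/list_add() whenever an allocation or free crosses one of the two boundaries (zero objects in use, or all of them).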
The slab is no longer linked to the @@ -1864,12 +1880,15 @@ */ spin_unlock_irq(&best_cachep->spinlock); kmem_slab_destroy(best_cachep, slabp); + if (progress) + /* don't need the cache's spinlock to read the order */ + nr_pages += 1UL << best_cachep->gfporder; spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); out: up(&cache_chain_sem); - return; + return nr_pages; } #ifdef CONFIG_PROC_FS @@ -1922,14 +1941,25 @@ spin_lock_irq(&cachep->spinlock); active_objs = 0; num_slabs = 0; - list_for_each(q,&cachep->slabs) { + list_for_each(q,&cachep->slabs_full) { slabp = list_entry(q, slab_t, list); + if (slabp->inuse != cachep->num) + BUG(); + active_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->slabs_partial) { + slabp = list_entry(q, slab_t, list); + if (slabp->inuse == cachep->num || !slabp->inuse) + BUG(); active_objs += slabp->inuse; - num_objs += cachep->num; + active_slabs++; + } + list_for_each(q,&cachep->slabs_free) { + slabp = list_entry(q, slab_t, list); if (slabp->inuse) - active_slabs++; - else - num_slabs++; + BUG(); + num_slabs++; } num_slabs+=active_slabs; num_objs = num_slabs*cachep->num; diff -urN 2.4.0-test7-pre5aa1/mm/swap_state.c 2.4.0-test7-pre5aa1-cz/mm/swap_state.c --- 2.4.0-test7-pre5aa1/mm/swap_state.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/swap_state.c Tue Aug 22 13:09:06 2000 @@ -58,7 +58,7 @@ BUG(); if (page->mapping) BUG(); - flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced)); + flags = page->flags & ~((1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_buffer_age)); page->flags = flags | (1 << PG_uptodate); add_to_page_cache_locked(page, &swapper_space, entry.val); } diff -urN 2.4.0-test7-pre5aa1/mm/vmscan.c 2.4.0-test7-pre5aa1-cz/mm/vmscan.c --- 2.4.0-test7-pre5aa1/mm/vmscan.c Thu Aug 17 19:57:44 2000 +++ 2.4.0-test7-pre5aa1-cz/mm/vmscan.c Tue Aug 22 15:35:01 2000 @@ -22,6 +22,7 @@ #include #include +#include /* * The swap-out functions return 1 if they successfully @@ -34,7 +35,7 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) +static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask, zone_t * zone) { pte_t pte; swp_entry_t entry; @@ -45,7 +46,7 @@ if (!pte_present(pte)) goto out_failed; page = pte_page(pte); - if ((!VALID_PAGE(page)) || PageReserved(page)) + if ((!VALID_PAGE(page)) || PageReserved(page) || !memclass(page->zone, zone)) goto out_failed; if (mm->swap_cnt) @@ -113,13 +114,6 @@ goto out_unlock; /* - * Don't do any of the expensive stuff if - * we're not really interested in this zone. - */ - if (page->zone->free_pages > page->zone->pages_high) - goto out_unlock; - - /* * Ok, it's really dirty. 
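try_to_swap_out() above now skips any page whose zone lies outside the class being reclaimed for. Assuming memclass(page->zone, zone) means "this page's zone can satisfy allocations of the target class" (lower zones serving higher classes on the same node; the real macro may be defined differently), the filter behaves roughly like this toy predicate:

#include <stdio.h>

/* Toy zone indices within one node: DMA < NORMAL < HIGHMEM. */
enum toy_zone { TOY_DMA, TOY_NORMAL, TOY_HIGHMEM };

/*
 * Illustrative stand-in for the memclass() test: a page is interesting for
 * a reclaim pass targeting 'classzone' only if its own zone can satisfy
 * allocations of that class.
 */
static int toy_memclass(enum toy_zone page_zone, enum toy_zone classzone)
{
	return page_zone <= classzone;
}

int main(void)
{
	/* Reclaiming for a NORMAL-zone shortage: DMA and NORMAL pages count,
	 * HIGHMEM pages are skipped, as try_to_swap_out() now does. */
	printf("DMA page helps NORMAL: %d\n",
	       toy_memclass(TOY_DMA, TOY_NORMAL));
	printf("HIGHMEM page helps NORMAL: %d\n",
	       toy_memclass(TOY_HIGHMEM, TOY_NORMAL));
	return 0;
}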
That means that * we should either create a new swap cache * entry for it, or we should write it back @@ -209,7 +203,7 @@ * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */ -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t * zone) { pte_t * pte; unsigned long pmd_end; @@ -231,9 +225,13 @@ do { int result; vma->vm_mm->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(mm, vma, address, pte, gfp_mask); + result = try_to_swap_out(mm, vma, address, pte, gfp_mask, zone); if (result) return result; + if (current->need_resched) { + vmlist_access_unlock(vma->vm_mm); + return 2; + } if (!mm->swap_cnt) return 0; address += PAGE_SIZE; @@ -242,7 +240,7 @@ return 0; } -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t * zone) { pmd_t * pmd; unsigned long pgd_end; @@ -262,7 +260,7 @@ end = pgd_end; do { - int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask); + int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask, zone); if (result) return result; if (!mm->swap_cnt) @@ -273,7 +271,7 @@ return 0; } -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask) +static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask, zone_t * zone) { pgd_t *pgdir; unsigned long end; @@ -288,7 +286,7 @@ if (address >= end) BUG(); do { - int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask); + int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask, zone); if (result) return result; if (!mm->swap_cnt) @@ -299,7 +297,7 @@ return 0; } -static int swap_out_mm(struct mm_struct * mm, int gfp_mask) +static int swap_out_mm(struct mm_struct * mm, int gfp_mask, zone_t * zone) { unsigned long address; struct vm_area_struct* vma; @@ -320,7 +318,7 @@ address = vma->vm_start; for (;;) { - int result = swap_out_vma(mm, vma, address, gfp_mask); + int result = swap_out_vma(mm, vma, address, gfp_mask, zone); if (result) return result; vma = vma->vm_next; @@ -342,11 +340,12 @@ * N.B. This function returns only 0 or 1. Return values != 1 from * the lower level routines result in continued processing. */ -static int swap_out(unsigned int priority, int gfp_mask) +static int swap_out(unsigned int priority, int gfp_mask, zone_t * zone) { struct task_struct * p; int counter; int __ret = 0; + int assign = 0; lock_kernel(); /* @@ -363,16 +362,16 @@ * Think of swap_cnt as a "shadow rss" - it tells us which process * we want to page out (always try largest first). */ - counter = (nr_threads << 2) >> (priority >> 2); - if (counter < 1) - counter = 1; + counter = nr_threads / priority ? 
: 1; for (; counter >= 0; counter--) { - unsigned long max_cnt = 0; - struct mm_struct *best = NULL; + unsigned long max_cnt; + struct mm_struct *best; int pid = 0; - int assign = 0; select: + max_cnt = 0; + best = NULL; + pid = 0; read_lock(&tasklist_lock); p = init_task.next_task; for (; p != &init_task; p = p->next_task) { @@ -391,6 +390,8 @@ } } read_unlock(&tasklist_lock); + if (assign == 1) + assign = 2; if (!best) { if (!assign) { assign = 1; @@ -401,9 +402,16 @@ int ret; atomic_inc(&best->mm_count); - ret = swap_out_mm(best, gfp_mask); + ret = swap_out_mm(best, gfp_mask, zone); mmdrop(best); + if (ret == 2) { + /* needs a reschedule */ + current->state = TASK_RUNNING; + schedule(); + goto select; + } + if (!ret) continue; @@ -418,50 +426,31 @@ return __ret; } -/* - * Check if there is any memory pressure (free_pages < pages_low) - */ -static inline int memory_pressure(void) +#define FALLBACK_PAGELRU_TO_DCACHE_RATIO ((400*1024*1024)>>PAGE_SHIFT) + +static inline int shrink_cache(int priority, int gfp_mask, zone_t * zone) { - pg_data_t *pgdat = pgdat_list; + static spinlock_t fallback_lock = SPIN_LOCK_UNLOCKED; + static int fallback_pagelru_to_dcache; + int ret, fallback; - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - if (zone->size && - zone->free_pages < zone->pages_low) - return 1; - } - pgdat = pgdat->node_next; - } while (pgdat); + if (in_interrupt()) + BUG(); - return 0; -} + spin_lock(&fallback_lock); + fallback = !(++fallback_pagelru_to_dcache % FALLBACK_PAGELRU_TO_DCACHE_RATIO); + spin_unlock(&fallback_lock); -/* - * Check if all zones have recently had memory_pressure (zone_wake_kswapd) - */ -static inline int keep_kswapd_awake(void) -{ - int all_recent = 1; - pg_data_t *pgdat = pgdat_list; + ret = shrink_mmap(priority, gfp_mask, zone); - do { - int i; - for(i = 0; i < MAX_NR_ZONES; i++) { - zone_t *zone = pgdat->node_zones+ i; - if (zone->size) { - if (zone->free_pages < zone->pages_min) - return 1; - if (!zone->zone_wake_kswapd) - all_recent = 0; - } - } - pgdat = pgdat->node_next; - } while (pgdat); + if ((fallback || !ret) && (gfp_mask & __GFP_IO)) { + fallback_pagelru_to_dcache = 0; - return all_recent; + ret += shrink_dcache_memory(priority, zone); + ret += shrink_icache_memory(priority, zone); + } + + return ret; } /* @@ -471,97 +460,112 @@ * * We want to try to free "count" pages, and we want to * cluster them so that we get good swap-out behaviour. - * - * Don't try _too_ hard, though. We don't want to have bad - * latency. - * - * Note: only called by kswapd and try_to_free_pages - * both can WAIT at top level. */ #define FREE_COUNT 8 -#define SWAP_COUNT 16 -static int do_try_to_free_pages(unsigned int gfp_mask) +#define SWAP_COUNT SWAP_CLUSTER_MAX +int try_to_free_pages(unsigned int gfp_mask, zone_t *zone) { - int priority; - int count = FREE_COUNT; - int swap_count; + int priority, count = FREE_COUNT, nr_pages, swap_count; /* Always trim SLAB caches when memory gets low. */ - kmem_cache_reap(gfp_mask); + count -= kmem_cache_reap(gfp_mask, zone); + if (count <= 0) + return 1; - priority = 64; + priority = 5; do { - if (current->need_resched) { - schedule(); - /* time has passed - pressure too? 
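The shrink_cache() helper above normally touches only the page LRU and falls back to the dentry and inode caches either on every Nth invocation or when shrink_mmap() makes no progress. Here is a compilable sketch of that policy with made-up stubs; the real code guards the counter with a spinlock, derives the ratio from 400 MB worth of pages, and only falls back when __GFP_IO is set.

#include <stdio.h>

#define TOY_FALLBACK_RATIO 4	/* toy value, see the lead-in above */

/* Stubs standing in for shrink_mmap() and the dentry/inode cache shrinkers. */
static int toy_shrink_mmap(void)
{
	static int lru_pages = 6;	/* pretend the LRU holds six reclaimable pages */

	if (lru_pages == 0)
		return 0;
	lru_pages--;
	return 1;
}

static int toy_shrink_dcache(void) { return 1; }
static int toy_shrink_icache(void) { return 1; }

/*
 * Mirror of the shrink_cache() policy: normally only the page LRU is
 * scanned, but on every Nth call, or whenever the LRU scan comes back
 * empty, the dentry and inode caches are trimmed as well.
 */
static int toy_shrink_cache(void)
{
	static int calls;
	int fallback = (++calls % TOY_FALLBACK_RATIO) == 0;
	int ret = toy_shrink_mmap();

	if (fallback || !ret) {
		calls = 0;
		ret += toy_shrink_dcache();
		ret += toy_shrink_icache();
	}
	return ret;
}

int main(void)
{
	int pass;

	for (pass = 0; pass < 8; pass++)
		printf("pass %d freed %d page(s)\n", pass, toy_shrink_cache());
	return 0;
}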
*/ - if (!memory_pressure()) - goto done; - } - - while (shrink_mmap(priority, gfp_mask)) { - if (!--count) + while ((nr_pages = shrink_cache(priority, gfp_mask, zone))) { + count -= nr_pages; + if (count <= 0) goto done; } - /* check if mission completed */ - if (!keep_kswapd_awake()) - goto done; - /* Try to get rid of some shared memory pages.. */ if (gfp_mask & __GFP_IO) { - /* - * don't be too light against the d/i cache since - * shrink_mmap() almost never fail when there's - * really plenty of memory free. - */ - count -= shrink_dcache_memory(priority, gfp_mask); - count -= shrink_icache_memory(priority, gfp_mask); - /* - * Not currently working, see fixme in shrink_?cache_memory - * In the inner funtions there is a comment: - * "To help debugging, a zero exit status indicates - * all slabs were released." (-arca?) - * lets handle it in a primitive but working way... - * if (count <= 0) - * goto done; - */ - if (!keep_kswapd_awake()) - goto done; - - while (shm_swap(priority, gfp_mask)) { + while (shm_swap(priority, zone)) { if (!--count) goto done; } } - /* - * Then, try to page stuff out.. - * - * This will not actually free any pages (they get - * put in the swap cache), so we must not count this - * as a "count" success. - */ + /* Then, try to page stuff out.. */ swap_count = SWAP_COUNT; - while (swap_out(priority, gfp_mask)) - if (--swap_count < 0) - break; + while (swap_out(priority, gfp_mask, zone)) { + if (!--swap_count) + goto done; + } + count -= SWAP_COUNT - swap_count; + if (count <= 0) + goto done; + } while (--priority > 0); +done: - } while (--priority >= 0); + return priority > 0; +} - /* Always end on a shrink_mmap.., may sleep... */ - while (shrink_mmap(0, gfp_mask)) { - if (!--count) - goto done; +DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); + +static int kswapd_work_pgdat(pg_data_t * pgdat) +{ + int worked = 0, i; + zone_t * zone; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->zone_wake_kswapd) + continue; + if (!try_to_free_pages(GFP_KSWAPD, zone)) { + zone->zone_wake_kswapd = 0; + continue; + } + worked = 1; } - /* Return 1 if any page is freed, or - * there are no more memory pressure */ - return (count < FREE_COUNT || !keep_kswapd_awake()); - -done: + + return worked; +} + +static void kswapd_work(void) +{ + int worked; + pg_data_t * pgdat; + + do { + worked = 0; + pgdat = pgdat_list; + do + worked |= kswapd_work_pgdat(pgdat); + while ((pgdat = pgdat->node_next)); + } while (worked); +} + +static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) +{ + zone_t * zone; + int i; + + for (i = pgdat->nr_zones-1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!zone->zone_wake_kswapd) + continue; + return 0; + } + return 1; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); +static int kswapd_can_sleep(void) +{ + pg_data_t * pgdat; + + pgdat = pgdat_list; + do { + if (kswapd_can_sleep_pgdat(pgdat)) + continue; + return 0; + } while ((pgdat = pgdat->node_next)); + + return 1; +} /* * The background pageout daemon, started as a kernel thread @@ -579,11 +583,13 @@ int kswapd(void *unused) { struct task_struct *tsk = current; + wait_queue_t wait; tsk->session = 1; tsk->pgrp = 1; strcpy(tsk->comm, "kswapd"); sigfillset(&tsk->blocked); + init_waitqueue_entry(&wait, tsk); /* * Tell the memory management that we're a "memory allocator", @@ -599,52 +605,29 @@ */ tsk->flags |= PF_MEMALLOC; - for (;;) { - if (!keep_kswapd_awake()) { - interruptible_sleep_on(&kswapd_wait); - } + while (1) { + /* + * If we actually get into a low-memory situation, + * 
the processes needing more memory will wake us + * up on a more timely basis. + */ + kswapd_work(); + run_task_queue(&tq_disk); - do_try_to_free_pages(GFP_KSWAPD); - } -} + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kswapd_wait, &wait); -/* - * Called by non-kswapd processes when they want more - * memory. - * - * In a perfect world, this should just wake up kswapd - * and return. We don't actually want to swap stuff out - * from user processes, because the locking issues are - * nasty to the extreme (file write locks, and MM locking) - * - * One option might be to let kswapd do all the page-out - * and VM page table scanning that needs locking, and this - * process thread could do just the mmap shrink stage that - * can be done by just dropping cached pages without having - * any deadlock issues. - */ -int try_to_free_pages(unsigned int gfp_mask) -{ - int retval = 1; + if (kswapd_can_sleep()) + schedule(); - if (gfp_mask & __GFP_WAIT) { - current->state = TASK_RUNNING; - current->flags |= PF_MEMALLOC; - retval = do_try_to_free_pages(gfp_mask); - current->flags &= ~PF_MEMALLOC; + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_wait, &wait); } - - /* someone needed memory that kswapd had not provided - * make sure kswapd runs, should not happen often */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); - - return retval; } static int __init kswapd_init(void) { - printk("Starting kswapd v1.7\n"); + printk("Starting kswapd v1.8\n"); swap_setup(); kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); return 0;
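Finally, the reworked kswapd main loop amounts to: keep calling try_to_free_pages() against every zone whose zone_wake_kswapd flag is raised, and go back to sleep only once no zone has the flag set. A single-node toy model of that control flow follows; all toy_* names are stand-ins, and the flag is normally raised by the page allocator, which the toy omits.

#include <stdio.h>

#define TOY_NR_ZONES 3

struct toy_zone {
	int wake_kswapd;	/* set when the zone runs low on free pages */
	int reclaimable;	/* pretend-pages still reclaimable in this zone */
};

static struct toy_zone zones[TOY_NR_ZONES] = {
	{ 1, 2 }, { 0, 0 }, { 1, 1 },
};

/* Stand-in for try_to_free_pages(gfp, zone): 1 = made progress, 0 = gave up. */
static int toy_try_to_free_pages(struct toy_zone *z)
{
	if (z->reclaimable == 0)
		return 0;
	z->reclaimable--;
	return 1;
}

/* One pass over the zones, as in kswapd_work_pgdat() above. */
static int toy_kswapd_work(void)
{
	int i, worked = 0;

	for (i = 0; i < TOY_NR_ZONES; i++) {
		struct toy_zone *z = &zones[i];

		if (!z->wake_kswapd)
			continue;
		if (!toy_try_to_free_pages(z)) {
			z->wake_kswapd = 0;	/* no progress: stop spinning on it */
			continue;
		}
		worked = 1;
	}
	return worked;
}

/* Mirrors kswapd_can_sleep(): sleep only when no zone wants attention. */
static int toy_kswapd_can_sleep(void)
{
	int i;

	for (i = 0; i < TOY_NR_ZONES; i++)
		if (zones[i].wake_kswapd)
			return 0;
	return 1;
}

int main(void)
{
	while (toy_kswapd_work())
		;	/* keep reclaiming while any zone makes progress */
	printf("kswapd may sleep: %d\n", toy_kswapd_can_sleep());
	return 0;
}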