Signed-off-by: Andrea Arcangeli diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/Kconfig.debug x/arch/i386/Kconfig.debug --- x-ref/arch/i386/Kconfig.debug 2004-10-27 03:14:17.000000000 +0200 +++ x/arch/i386/Kconfig.debug 2004-10-30 15:56:48.746593856 +0200 @@ -46,6 +46,13 @@ config DEBUG_PAGEALLOC This results in a large slowdown, but helps to find certain types of memory corruptions. +config DEBUG_PAGE_ZERO + bool "Page zero debugging" + depends on DEBUG_KERNEL + help + Verify that the PG_zero pages returned by __GFP_ZERO allocations + are truly zero pages. + config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" help diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/kernel/process.c x/arch/i386/kernel/process.c --- x-ref/arch/i386/kernel/process.c 2004-10-27 03:14:17.000000000 +0200 +++ x/arch/i386/kernel/process.c 2004-10-30 15:56:48.752592944 +0200 @@ -143,6 +143,9 @@ void cpu_idle (void) while (1) { while (!need_resched()) { void (*idle)(void); + + idle_page_zero(); + /* * Mark this as an RCU critical section so that * synchronize_kernel() in the unload path waits diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/i386/mm/pgtable.c x/arch/i386/mm/pgtable.c --- x-ref/arch/i386/mm/pgtable.c 2004-08-25 02:47:49.000000000 +0200 +++ x/arch/i386/mm/pgtable.c 2004-10-30 15:57:43.539264104 +0200 @@ -132,10 +132,7 @@ void __set_fixmap (enum fixed_addresses pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (pte) - clear_page(pte); - return pte; + return (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) @@ -143,12 +140,14 @@ struct page *pte_alloc_one(struct mm_str struct page *pte; #ifdef CONFIG_HIGHPTE - pte = 
alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); #endif - if (pte) + if (pte && !PageZero(pte)) { clear_highpage(pte); + SetPageZero(pte); + } return pte; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/gfp.h x/include/linux/gfp.h --- x-ref/include/linux/gfp.h 2004-10-27 03:14:22.000000000 +0200 +++ x/include/linux/gfp.h 2004-10-30 15:56:48.766590816 +0200 @@ -37,6 +37,8 @@ struct vm_area_struct; #define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ #define __GFP_NO_GROW 0x2000 /* Slab internal usage */ #define __GFP_COMP 0x4000 /* Add compound page metadata */ +#define __GFP_ZERO 0x8000 /* Alloc a zero page */ +#define __GFP_ONLY_ZERO 0x10000 /* Only try to find an already zero page */ #define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) @@ -44,7 +46,7 @@ struct vm_area_struct; /* if you forget to add the bitmask here kernel will crash, period */ #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ - __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP) + __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP|__GFP_ZERO) #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) @@ -52,6 +54,8 @@ struct vm_area_struct; #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS) #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM) +#define GFP_HIGHZERO (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_ZERO) +#define GFP_ONLYZERO (__GFP_HIGHMEM | __GFP_ZERO | __GFP_ONLY_ZERO) /* atomic */ /* Flag - indicates that the buffer will be suitable for DMA. 
Ignored on some platforms, used as appropriate on others */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/mmzone.h x/include/linux/mmzone.h --- x-ref/include/linux/mmzone.h 2004-10-30 15:55:57.365404984 +0200 +++ x/include/linux/mmzone.h 2004-10-30 15:56:48.767590664 +0200 @@ -42,16 +42,29 @@ struct zone_padding { #define ZONE_PADDING(name) #endif +enum per_cpu_pages_type { + PER_CPU_PAGES_HOT_COLD, /* hot at head.next, cold at head.prev */ + PER_CPU_PAGES_ZERO, /* zero pages, no need of clear_page for these */ + NR_PER_CPU_PAGES, +}; + struct per_cpu_pages { int count; /* number of pages in the list */ - int low; /* low watermark, refill needed */ - int high; /* high watermark, emptying needed */ - int batch; /* chunk size for buddy add/remove */ + int max_size; + int batch; struct list_head list; /* the list of pages */ }; +struct sysctl_per_cpu_pages { + int size_ratio; + int max_size; + int batch_ratio; + int max_batch; + int disable_idle_page_zero; +}; + struct per_cpu_pageset { - struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ + struct per_cpu_pages pcp[NR_PER_CPU_PAGES]; #ifdef CONFIG_NUMA unsigned long numa_hit; /* allocated in intended node */ unsigned long numa_miss; /* allocated in non intended node */ @@ -364,6 +377,10 @@ int min_free_kbytes_sysctl_handler(struc extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +extern struct sysctl_per_cpu_pages sysctl_per_cpu_pages; +extern int per_cpu_pages_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern void idle_page_zero(void); #include /* Returns the number of the current Node. 
*/ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/page-flags.h x/include/linux/page-flags.h --- x-ref/include/linux/page-flags.h 2004-10-27 03:14:22.000000000 +0200 +++ x/include/linux/page-flags.h 2004-10-30 15:56:48.767590664 +0200 @@ -74,6 +74,7 @@ #define PG_swapcache 16 /* Swap page: swp_entry_t in private */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ +#define PG_zero 19 /* Zero contents, don't cover compound */ /* @@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u #define PageSwapCache(page) 0 #endif +#define PageZero(page) test_bit(PG_zero, &(page)->flags) +#define SetPageZero(page) set_bit(PG_zero, &(page)->flags) +#define ClearPageZero(page) clear_bit(PG_zero, &(page)->flags) +#define TestClearPageZero(page) test_and_clear_bit(PG_zero, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/sysctl.h x/include/linux/sysctl.h --- x-ref/include/linux/sysctl.h 2004-10-30 15:55:57.368404528 +0200 +++ x/include/linux/sysctl.h 2004-10-30 15:56:48.768590512 +0200 @@ -167,6 +167,7 @@ enum VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ + VM_PER_CPU_PAGES=28, /* per cpu pages tuning */ }; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/kernel/sysctl.c x/kernel/sysctl.c --- x-ref/kernel/sysctl.c 2004-10-30 15:55:57.378403008 +0200 +++ x/kernel/sysctl.c 2004-10-30 15:56:48.776589296 +0200 @@ -798,6 +798,15 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif + { + .ctl_name = VM_PER_CPU_PAGES, + .procname = "per_cpu_pages", + .data = &sysctl_per_cpu_pages, + .maxlen = sizeof(sysctl_per_cpu_pages), + 
.mode = 0644, + .proc_handler = &per_cpu_pages_sysctl_handler, + .strategy = &sysctl_intvec, + }, { .ctl_name = 0 } }; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/memory.c x/mm/memory.c --- x-ref/mm/memory.c 2004-10-27 03:14:22.000000000 +0200 +++ x/mm/memory.c 2004-10-30 15:56:48.779588840 +0200 @@ -1059,7 +1059,7 @@ static int do_wp_page(struct mm_struct * } old_page = pfn_to_page(pfn); - if (!TestSetPageLocked(old_page)) { + if (!PageReserved(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { @@ -1072,7 +1072,32 @@ static int do_wp_page(struct mm_struct * spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; } + } else if (old_page == ZERO_PAGE(address)) { /* zero page is PageReserved */ + /* no need to clear_page or copy or unlock */ + new_page = alloc_page_vma(GFP_ONLYZERO, vma, address); + if (new_page) { + if (unlikely(!vma->anon_vma)) { + /* oh well, let's do it then */ + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + + if (unlikely(anon_vma_prepare(vma))) { + old_page = new_page; /* free it */ + goto no_new_page; + } + + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (unlikely(!pte_same(*page_table, pte))) { + old_page = new_page; /* free it */ + goto no_new_page; + } + } + ClearPageZero(new_page); + goto map_rw; + } } + pte_unmap(page_table); /* @@ -1095,9 +1120,10 @@ static int do_wp_page(struct mm_struct * spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, pte))) { - if (PageReserved(old_page)) + if (PageReserved(old_page)) { + map_rw: ++mm->rss; - else + } else page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); lru_cache_add_active(new_page); @@ -1416,10 +1442,7 @@ do_anonymous_page(struct mm_struct *mm, unsigned long addr) { pte_t entry; - struct page * page = ZERO_PAGE(addr); - - /* Read-only 
mapping of ZERO_PAGE. */ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + struct page * page; /* ..except if it's a write access */ if (write_access) { @@ -1429,10 +1452,11 @@ do_anonymous_page(struct mm_struct *mm, if (unlikely(anon_vma_prepare(vma))) goto no_mem; - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + page = alloc_page_vma(GFP_HIGHZERO, vma, addr); if (!page) goto no_mem; - clear_user_highpage(page, addr); + if (!TestClearPageZero(page)) + clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, addr); @@ -1450,6 +1474,9 @@ do_anonymous_page(struct mm_struct *mm, lru_cache_add_active(page); mark_page_accessed(page); page_add_anon_rmap(page, vma, addr); + } else { + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); } set_pte(page_table, entry); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/page_alloc.c x/mm/page_alloc.c --- x-ref/mm/page_alloc.c 2004-10-30 15:55:57.391401032 +0200 +++ x/mm/page_alloc.c 2004-10-30 15:56:48.786587776 +0200 @@ -12,6 +12,7 @@ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 * Per cpu hot/cold page lists, bulk allocation, Martin J. 
Bligh, Sept 2002 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + * Per cpu zero lists, Andrea Arcangeli, SUSE, Oct 2004 */ #include @@ -50,6 +51,14 @@ int numnodes = 1; */ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; +struct sysctl_per_cpu_pages sysctl_per_cpu_pages = { + 256, /* per-cpu pagelist size is "zone->present_pages/size_ratio" */ + 4*1024*1024 >> PAGE_SHIFT, /* maximum size */ + 8, /* batch is calculated as "max_size/batch_ratio" */ + 32, /* max number of pages to free/alloc at time - latency control */ + 0, /* if 1 it disables the idle page zero background generation */ +}; + EXPORT_SYMBOL(totalram_pages); EXPORT_SYMBOL(nr_swap_pages); @@ -189,7 +198,7 @@ static inline void __free_pages_bulk (st { unsigned long page_idx, index, mask; - if (order) + if (unlikely(order)) destroy_compound_page(page, order); mask = (~0UL) << order; page_idx = page - base; @@ -362,7 +371,7 @@ static void prep_new_page(struct page *p page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); + 1 << PG_checked | 1 << PG_mappedtodisk | 1 << PG_zero); page->private = 0; set_page_refs(page, order); } @@ -399,24 +408,24 @@ static struct page *__rmqueue(struct zon * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. + * This assumes irqs are disabled by the caller. 
*/ -static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) +static int __rmqueue_bulk(struct zone *zone, unsigned long count, + struct list_head *list) { - unsigned long flags; int i; int allocated = 0; struct page *page; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); + page = __rmqueue(zone, 0); if (page == NULL) break; allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return allocated; } @@ -503,6 +512,26 @@ static void zone_statistics(struct zonel #endif } +#ifdef CONFIG_DEBUG_PAGE_ZERO +static void fastcall debug_page_zero(struct page *page) +{ + char * p; + unsigned long flags; + + if (system_state == SYSTEM_RUNNING) { + local_irq_save(flags); + + p = kmap_atomic(page, KM_IRQ0); + BUG_ON(memcmp(p, page_address(ZERO_PAGE(p /* ?? */)), PAGE_SIZE)); + kunmap_atomic(p, KM_IRQ0); + + local_irq_restore(flags); + } +} +#else +#define debug_page_zero(page) do { } while (0) +#endif + /* * Free a 0-order page */ @@ -512,6 +541,12 @@ static void fastcall free_hot_cold_page( struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct list_head * place; + int excess_pages; + int zero = PageZero(page); + + if (zero) + debug_page_zero(page); arch_free_page(page, 0); @@ -520,12 +555,29 @@ static void fastcall free_hot_cold_page( if (PageAnon(page)) page->mapping = NULL; free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone->pageset[get_cpu()].pcp[zero]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->lru, &pcp->list); - pcp->count++; + if (zero) { + excess_pages = pcp->count - pcp->max_size; + if (excess_pages >= 0) { + if (excess_pages > 0) + pcp->count -= free_pages_bulk(zone, + max(excess_pages, 
pcp->batch), + &pcp->list, 0); + pcp = &zone->pageset[get_cpu()].pcp[0]; + } + } + place = pcp->list.prev; + if (!cold) + place = &pcp->list; + list_add(&page->lru, place); + ++pcp->count; + /* pcp->count == 0 must stop with pcp->max_size == 0 */ + excess_pages = pcp->count - pcp->max_size; + if (excess_pages > 0) + pcp->count -= free_pages_bulk(zone, + max(excess_pages, pcp->batch), + &pcp->list, 0); local_irq_restore(flags); put_cpu(); } @@ -540,43 +592,65 @@ void fastcall free_cold_page(struct page free_hot_cold_page(page, 1); } +static struct page * +alloc_per_cpu_page(struct zone *zone, int gfp_flags, int local) +{ + unsigned long flags; + struct per_cpu_pages *pcp; + struct page * page = NULL; + /* + * __GFP_ZERO and __GFP_COLD are orthogonal and they could even + * be used at the same time + */ + int zero = !!(gfp_flags & __GFP_ZERO); + int cold = gfp_flags & __GFP_COLD; + + pcp = &zone->pageset[get_cpu()].pcp[zero]; + if (!pcp->count && local) + goto end_put_cpu; + local_irq_save(flags); + if (!pcp->count) { + if (local) + goto end_irq_restore; + pcp->count += __rmqueue_bulk(zone, pcp->batch, &pcp->list); + } + if (pcp->count) { + struct list_head * pick = pcp->list.prev; + if (!cold) + pick = pcp->list.next; + page = list_entry(pick, struct page, lru); + list_del(&page->lru); + pcp->count--; + } + end_irq_restore: + local_irq_restore(flags); + end_put_cpu: + put_cpu(); + + return page; +} + /* * Really, prep_compound_page() should be called from __rmqueue_bulk(). But * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. 
*/ - +#define buffered_rmqueue(zone, order, gfp_flags) __buffered_rmqueue(zone, order, gfp_flags, 0) static struct page * -buffered_rmqueue(struct zone *zone, int order, int gfp_flags) +__buffered_rmqueue(struct zone *zone, int order, int gfp_flags, int local) { unsigned long flags; struct page *page = NULL; - int cold = !!(gfp_flags & __GFP_COLD); - - if (order == 0) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[get_cpu()].pcp[cold]; - local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); - if (pcp->count) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; - } - local_irq_restore(flags); - put_cpu(); - } - if (page == NULL) { + if (likely(!order)) + page = alloc_per_cpu_page(zone, gfp_flags, local); + else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); spin_unlock_irqrestore(&zone->lock, flags); } - if (page != NULL) { + if (likely(page)) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); prep_new_page(page, order); @@ -591,7 +665,7 @@ buffered_rmqueue(struct zone *zone, int */ struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, - struct zonelist *zonelist) + struct zonelist *zonelist) { const int wait = gfp_mask & __GFP_WAIT; unsigned long min; @@ -599,7 +673,7 @@ __alloc_pages(unsigned int gfp_mask, uns struct page *page; struct reclaim_state reclaim_state; struct task_struct *p = current; - int i; + int i, pass, zero = gfp_mask & __GFP_ZERO; int classzone_idx; int do_retry; int can_try_harder; @@ -628,16 +702,52 @@ __alloc_pages(unsigned int gfp_mask, uns */ classzone_idx = zone_idx(zones[0]); - /* Go through the zonelist once, looking for a zone with enough free */ - for (i = 0; (z = zones[i]) != NULL; i++) { - min = z->pages_low + (1<lowmem_reserve[classzone_idx]; + /* zero pages can only be provided of order 0 */ + BUG_ON(zero && order); + BUG_ON((gfp_mask & 
__GFP_ONLY_ZERO) && !(gfp_mask & __GFP_ZERO)); - if (z->free_pages < min) - continue; + /* + * We can't allocate from the buddy allocator before even trying + * the per-cpu-pages of the lower zones if lowmem_reserve permits. + * This is important only here in front of the allocator, this + * is the fast path. As soon as we reach pass 2 of this loop + * we're in the slow path and it doesn't matter anymore to use + * efficintly the per-cpu-page resources for this allocation + * (as worse we'll leave something available for the next one ;). + * + * pass 0 is for the per-cpu-zero list (this is the only "zero" allocator). + * pass 1 is for the per-cpu-hot-cold list. + * pass 2 is the buddy allocator. + */ + pass = 0; + if (!zero) + pass = 1; + if (unlikely(order)) + pass = 2; + for (; pass < 3; pass++) { + /* Go through the zonelist once, looking for a zone with enough free */ + for (i = 0; (z = zones[i]) != NULL; i++) { + min = z->pages_low + (1<lowmem_reserve[classzone_idx]; - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; + if (z->free_pages < min) + continue; + + switch (pass) { + case 0 ... 
1: + page = __buffered_rmqueue(z, order, gfp_mask, 1); + if (page) + goto got_pg; + break; + case 2: + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + } + if (gfp_mask & __GFP_ONLY_ZERO) + return NULL; + /* downgrade to hot-cold per-cpu-page list */ + gfp_mask &= ~__GFP_ZERO; } for (i = 0; (z = zones[i]) != NULL; i++) @@ -734,6 +844,10 @@ nopage: } return NULL; got_pg: + if (pass == 0) { + debug_page_zero(page); + SetPageZero(page); + } zone_statistics(zonelist, z); kernel_map_pages(page, 1 << order, 1); return page; @@ -765,10 +879,11 @@ fastcall unsigned long get_zeroed_page(u */ BUG_ON(gfp_mask & __GFP_HIGHMEM); - page = alloc_pages(gfp_mask, 0); + page = alloc_pages(gfp_mask | __GFP_ZERO, 0); if (page) { void *address = page_address(page); - clear_page(address); + if (!TestClearPageZero(page)) + clear_page(address); return (unsigned long) address; } return 0; @@ -1060,20 +1175,20 @@ void show_free_areas(void) pageset = zone->pageset + cpu; - for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", + for (temperature = 0; temperature < ARRAY_SIZE(pageset->pcp); temperature++) + printk("cpu %d %s: count %d, max_size %d, batch %d\n", cpu, - temperature ? "cold" : "hot", - pageset->pcp[temperature].low, - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); + temperature ? 
"zero" : "hot_cold", + pageset->pcp[temperature].count, + pageset->pcp[temperature].max_size, + pageset->pcp[temperature].batch); } } get_page_state(&ps); get_zone_counts(&active, &inactive, &free); - printk("\nFree pages: %11ukB (%ukB HighMem)\n", + printk("Free pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); @@ -1143,6 +1258,124 @@ void show_free_areas(void) show_swap_cache_info(); } +#define setup_per_cpu_pages_zone_boot(zone) __setup_per_cpu_pages_zone(zone, 1) +#define setup_per_cpu_pages_zone_sysctl(zone) __setup_per_cpu_pages_zone(zone, 0) +static unsigned long __setup_per_cpu_pages_zone(struct zone * zone, int boot) +{ + int i, cpu; + struct per_cpu_pages *pcp; + unsigned long per_cpu_pages = 0; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + for (i = 0; i < ARRAY_SIZE(zone->pageset[cpu].pcp); i++) { + pcp = &zone->pageset[cpu].pcp[i]; + + if (boot) { + pcp->count = 0; + INIT_LIST_HEAD(&pcp->list); + } + + /* set pcp->max_size with sysctl; pcp->max_size == 0 is allowed for tiny boxes */ + pcp->max_size = zone->present_pages / sysctl_per_cpu_pages.size_ratio; + if (pcp->max_size > sysctl_per_cpu_pages.max_size) + pcp->max_size = sysctl_per_cpu_pages.max_size; + per_cpu_pages += pcp->max_size; + + /* set pcp->batch with sysctl */ + pcp->batch = pcp->max_size / sysctl_per_cpu_pages.batch_ratio; + if (pcp->batch > sysctl_per_cpu_pages.max_batch) + pcp->batch = sysctl_per_cpu_pages.max_batch; + if (pcp->batch < 1) + pcp->batch = 1; + } + } + + return per_cpu_pages; +} + +static void setup_per_cpu_pages(void) +{ + struct pglist_data *pgdat; + int j; + + for_each_pgdat(pgdat) { + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone * zone = pgdat->node_zones + j; + setup_per_cpu_pages_zone_sysctl(zone); + } + } +} + +static int idle_page_zero_zone(struct zone * zone) +{ + struct per_cpu_pageset *pageset; + struct per_cpu_pages *pcp; + struct page * page; + unsigned long flags; + unsigned long * p; + int i; + + cond_resched(); + + /* the 
idle task cannot change cpu */ + pageset = zone->pageset + smp_processor_id(); + if (pageset->pcp[1].count >= pageset->pcp[1].max_size || + !pageset->pcp[0].count) + return 0; + + cond_resched(); + + page = alloc_per_cpu_page(zone, 0, 1); + + cond_resched(); + + if (!page) + return 0; + + if (!PageZero(page)) { + p = kmap_atomic(page, KM_USER0); + for (i = 0; i < PAGE_SIZE / sizeof(long); ++i) { + p[i] = 0; + if (need_resched()) { + kunmap_atomic(p, KM_USER0); + __cond_resched(); + p = kmap_atomic(page, KM_USER0); + } + } + kunmap_atomic(p, KM_USER0); + + SetPageZero(page); + } + + cond_resched(); + + local_irq_save(flags); + pcp = &pageset->pcp[1]; + list_add(&page->lru, &pcp->list); + ++pcp->count; + local_irq_restore(flags); + + cond_resched(); + + return 1; +} + +void idle_page_zero(void) +{ + struct pglist_data *pgdat; + int j; + + if (sysctl_per_cpu_pages.disable_idle_page_zero) + return; + + pgdat = NODE_DATA(numa_node_id()); + for (j = MAX_NR_ZONES-1; j >= 0; j--) { + struct zone * zone = pgdat->node_zones + j; + if (idle_page_zero_zone(zone)) + return; + } +} + /* * Builds allocation fallback zone lists. */ @@ -1477,7 +1710,7 @@ static void __init free_area_init_core(s { unsigned long i, j; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - int cpu, nid = pgdat->node_id; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; pgdat->nr_zones = 0; @@ -1486,7 +1719,7 @@ static void __init free_area_init_core(s for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; + unsigned long per_cpu_pages; zone_table[NODEZONE(nid, j)] = zone; realsize = size = zones_size[j]; @@ -1507,39 +1740,9 @@ static void __init free_area_init_core(s zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. 
But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + per_cpu_pages = setup_per_cpu_pages_zone_boot(zone); + printk(KERN_DEBUG " %s zone: %lu pages, per_cpu_pages:%lu\n", - zone_names[j], realsize, batch); wait
+ */ +int per_cpu_pages_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + setup_per_cpu_pages(); + return 0; +} + +/* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/shmem.c x/mm/shmem.c --- x-ref/mm/shmem.c 2004-10-27 03:14:22.000000000 +0200 +++ x/mm/shmem.c 2004-10-30 15:56:48.789587320 +0200 @@ -85,7 +85,7 @@ static inline struct page *shmem_dir_all * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. */ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); + return alloc_pages(gfp_mask | __GFP_HIGH, PAGE_CACHE_SHIFT-PAGE_SHIFT); } static inline void shmem_dir_free(struct page *page) @@ -368,7 +368,8 @@ static swp_entry_t *shmem_swp_alloc(stru spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); if (page) { - clear_highpage(page); + if (!TestClearPageZero(page)) + clear_highpage(page); page->nr_swapped = 0; } spin_lock(&info->lock); @@ -848,7 +849,7 @@ shmem_alloc_page(unsigned long gfp, stru pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); pvma.vm_pgoff = idx; pvma.vm_end = PAGE_SIZE; - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0); mpol_free(pvma.vm_policy); return page; } @@ -864,7 +865,7 @@ static inline struct page * shmem_alloc_page(unsigned long gfp,struct shmem_inode_info *info, unsigned long idx) { - return alloc_page(gfp); + return alloc_page(gfp | __GFP_ZERO); } #endif @@ -1073,7 +1074,8 @@ repeat: info->alloced++; spin_unlock(&info->lock); - clear_highpage(filepage); + if (!TestClearPageZero(filepage)) + clear_highpage(filepage); flush_dcache_page(filepage); 
SetPageUptodate(filepage); } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/swapfile.c x/mm/swapfile.c --- x-ref/mm/swapfile.c 2004-10-27 03:14:22.000000000 +0200 +++ x/mm/swapfile.c 2004-10-30 15:56:48.791587016 +0200 @@ -312,8 +312,8 @@ int can_share_swap_page(struct page *pag { int retval = 0; - if (!PageLocked(page)) - BUG(); + BUG_ON(!PageLocked(page)); + BUG_ON(PageReserved(page)); switch (page_count(page)) { case 3: if (!PagePrivate(page)) @@ -325,8 +325,6 @@ int can_share_swap_page(struct page *pag retval = exclusive_swap_page(page); break; case 1: - if (PageReserved(page)) - break; retval = 1; } return retval;