Attached are the four patches that implement dynamic unmapping from the kernel linear mapping for aggressive use-after-free detection.

patch-umap-core: update the cache flushing logic within change_page_attr():
- use page->list for queueing the deferred pages; this avoids the need for OOM handling.
- use a spinlock instead of the semaphore.

patch-umap-gfp: unmap pages between free_pages() and the next gfp.
- use change_page_attr().
- x86-isms: sending an IPI for the TLB flush during gfp is not possible, because gfp can be called with local interrupts disabled. This means that after free_pages(), another cpu could continue to access the page if it still has a TLB entry. x86 cpus do not cache negative lookup results, thus there won't be oopses due to missing flushes after alloc_pages().
- use page->private to log failed unmap calls. Without it, I got an oops during boot.

patch-umap-slab: add change_page_attr() calls to the slabs that support it. The implementation is identical to patch-umap-gfp.

patch-umap-task: increase the task_struct cache object size to PAGE_SIZE, so that slab can unmap the pages.

(An illustrative sketch of how the reworked change_page_attr()/global_flush_tlb() pair is meant to be called is appended after the patches.)

 arch/i386/Kconfig             |    8 ++++
 arch/i386/kernel/cpu/common.c |    8 ++++
 arch/i386/mm/pageattr.c       |   65 +++++++++++++++++-----------------------
 kernel/fork.c                 |   12 ++++++-
 mm/page_alloc.c               |   56 ++++++++++++++++++++++++++++++++--
 mm/slab.c                     |   68 +++++++++++++++++++++++++++++++++++++---
 6 files changed, 171 insertions(+), 46 deletions(-)

diff -puN arch/i386/Kconfig~unmap-page-debugging arch/i386/Kconfig
--- 25/arch/i386/Kconfig~unmap-page-debugging 2003-05-22 01:17:12.000000000 -0700
+++ 25-akpm/arch/i386/Kconfig 2003-05-22 01:17:13.000000000 -0700
@@ -1559,6 +1559,14 @@ config SPINLINE
 	  itself (as ".text.lock.filename"). This can be helpful for finding
 	  the callers of locks.
 
+config DEBUG_PAGEALLOC
+	bool "Page alloc debugging"
+	depends on DEBUG_KERNEL
+	help
+	  Unmap pages from the kernel linear mapping after free_pages().
+	  This results in a large slowdown, but helps to find certain types
+	  of memory corruptions.
+
 config DEBUG_HIGHMEM
 	bool "Highmem debugging"
 	depends on DEBUG_KERNEL && HIGHMEM
diff -puN arch/i386/kernel/cpu/common.c~unmap-page-debugging arch/i386/kernel/cpu/common.c
--- 25/arch/i386/kernel/cpu/common.c~unmap-page-debugging 2003-05-22 01:17:12.000000000 -0700
+++ 25-akpm/arch/i386/kernel/cpu/common.c 2003-05-22 01:17:13.000000000 -0700
@@ -430,6 +430,14 @@ void __init early_cpu_init(void)
 	rise_init_cpu();
 	nexgen_init_cpu();
 	umc_init_cpu();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/* pse is not compatible with on-the-fly unmapping,
+	 * disable it even if the cpus claim to support it.
+	 */
+	clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+	disable_pse = 1;
+#endif
 }
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
diff -puN arch/i386/mm/pageattr.c~unmap-page-debugging arch/i386/mm/pageattr.c
--- 25/arch/i386/mm/pageattr.c~unmap-page-debugging 2003-05-22 01:17:12.000000000 -0700
+++ 25-akpm/arch/i386/mm/pageattr.c 2003-05-22 01:17:13.000000000 -0700
@@ -13,6 +13,10 @@
 #include
 #include
 
+static spinlock_t cpa_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head df_list = LIST_HEAD_INIT(df_list);
+
+
 static inline pte_t *lookup_address(unsigned long address)
 {
 	pgd_t *pgd = pgd_offset_k(address);
@@ -31,10 +35,15 @@ static struct page *split_large_page(uns
 {
 	int i;
 	unsigned long addr;
-	struct page *base = alloc_pages(GFP_KERNEL, 0);
+	struct page *base;
 	pte_t *pbase;
+
+	spin_unlock_irq(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL, 0);
+	spin_lock_irq(&cpa_lock);
 	if (!base)
 		return NULL;
+
 	address = __pa(address);
 	addr = address & LARGE_PAGE_MASK;
 	pbase = (pte_t *)page_address(base);
@@ -90,7 +99,7 @@ static inline void revert_page(struct pa
 }
 
 static int
-__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage)
+__change_page_attr(struct page *page, pgprot_t prot)
 {
 	pte_t *kpte;
 	unsigned long address;
@@ -126,7 +135,7 @@ __change_page_attr(struct page *page, pg
 	}
 
 	if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) {
-		*oldpage = kpte_page;
+		list_add(&kpte_page->list, &df_list);
 		revert_page(kpte_page, address);
 	}
 	return 0;
@@ -137,12 +146,6 @@ static inline void flush_map(void)
 {
 	on_each_cpu(flush_kernel_map, NULL, 1, 1);
 }
-struct deferred_page {
-	struct deferred_page *next;
-	struct page *fpage;
-};
-static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
-
 /*
  * Change the page attributes of an page in the linear mapping.
  *
@@ -159,46 +162,36 @@ static struct deferred_page *df_list; /*
 int change_page_attr(struct page *page, int numpages, pgprot_t prot)
 {
 	int err = 0;
-	struct page *fpage;
 	int i;
+	unsigned long flags;
 
-	down_write(&init_mm.mmap_sem);
+	spin_lock_irqsave(&cpa_lock, flags);
 	for (i = 0; i < numpages; i++, page++) {
-		fpage = NULL;
-		err = __change_page_attr(page, prot, &fpage);
+		err = __change_page_attr(page, prot);
 		if (err)
 			break;
-		if (fpage) {
-			struct deferred_page *df;
-			df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
-			if (!df) {
-				flush_map();
-				__free_page(fpage);
-			} else {
-				df->next = df_list;
-				df->fpage = fpage;
-				df_list = df;
-			}
-		}
 	}
-	up_write(&init_mm.mmap_sem);
+	spin_unlock_irqrestore(&cpa_lock, flags);
 	return err;
 }
 
 void global_flush_tlb(void)
 {
-	struct deferred_page *df, *next_df;
+	LIST_HEAD(l);
+	struct list_head* n;
+
+	BUG_ON(irqs_disabled());
 
-	down_read(&init_mm.mmap_sem);
-	df = xchg(&df_list, NULL);
-	up_read(&init_mm.mmap_sem);
+	spin_lock_irq(&cpa_lock);
+	list_splice_init(&df_list, &l);
+	spin_unlock_irq(&cpa_lock);
 	flush_map();
-	for (; df; df = next_df) {
-		next_df = df->next;
-		if (df->fpage)
-			__free_page(df->fpage);
-		kfree(df);
-	}
+	n = l.next;
+	while (n != &l) {
+		struct page *pg = list_entry(n, struct page, list);
+		n = n->next;
+		__free_page(pg);
+	}
 }
 
 EXPORT_SYMBOL(change_page_attr);
diff -puN kernel/fork.c~unmap-page-debugging kernel/fork.c
--- 25/kernel/fork.c~unmap-page-debugging 2003-05-22 01:17:12.000000000 -0700
+++ 25-akpm/kernel/fork.c 2003-05-22 01:17:13.000000000 -0700
@@ -187,10 +187,18 @@ int autoremove_wake_function(wait_queue_
 void __init fork_init(unsigned long mempages)
 {
 	/* create a slab on which task_structs can be allocated */
+#ifdef CONFIG_DEBUG_PAGEALLOC
 	task_struct_cachep =
 		kmem_cache_create("task_struct",
-				  sizeof(struct task_struct),0,
-				  SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+			max((size_t)PAGE_SIZE, sizeof(struct task_struct)),
+			0, SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+#else
+	task_struct_cachep =
+		kmem_cache_create("task_struct",
+			sizeof(struct task_struct),
+			0, SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+#endif
+
 	if (!task_struct_cachep)
 		panic("fork_init(): cannot create task_struct SLAB cache");
diff -puN mm/page_alloc.c~unmap-page-debugging mm/page_alloc.c
--- 25/mm/page_alloc.c~unmap-page-debugging 2003-05-22 01:17:12.000000000 -0700
+++ 25-akpm/mm/page_alloc.c 2003-05-22 01:17:13.000000000 -0700
@@ -30,6 +30,8 @@
 #include
 #include
 
+#include
+#include
 
 DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
 DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
@@ -52,6 +54,47 @@ static int zone_balance_ratio[MAX_NR_ZON
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static int __map_pages(struct page *page, unsigned int num, pgprot_t prot)
+{
+	int retval;
+#ifdef CONFIG_HIGHMEM
+	if (page >= highmem_start_page)
+		return -1;
+#endif
+	retval = change_page_attr(page,num,prot);
+	/* we should perform an IPI and flush all tlbs,
+	 * but that can deadlock->flush only current cpu.
+	 */
+	__flush_tlb_all();
+	return retval;
+}
+
+static void map_pages(struct page *page, unsigned int num)
+{
+	if (page->private == 1)
+		return;
+	__map_pages(page, num, PAGE_KERNEL);
+}
+
+static void unmap_pages(struct page *page, unsigned int num)
+{
+	if (__map_pages(page, num, __pgprot(0)) < 0) {
+		page->private = 1;
+		return;
+	}
+	page->private = 0;
+}
+#else
+static void unmap_pages(struct page *page, unsigned int num)
+{
+}
+
+static void map_pages(struct page *page, unsigned int num)
+{
+}
+#endif
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
@@ -266,6 +309,7 @@ void __free_pages_ok(struct page *page,
 	mod_page_state(pgfree, 1 << order);
 	free_pages_check(__FUNCTION__, page);
 	list_add(&page->list, &list);
+	unmap_pages(page, 1<<order);
[...]
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
@@ -557,7 +602,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 		    (!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += z->pages_low * sysctl_lower_zone_protection;
 	}
@@ -580,7 +625,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 		    (!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += local_min * sysctl_lower_zone_protection;
 	}
@@ -595,7 +640,7 @@ rebalance:
 
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -623,7 +668,7 @@ rebalance:
 		    (!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += z->pages_low * sysctl_lower_zone_protection;
 	}
@@ -654,6 +699,9 @@ nopage:
 			current->comm, order, gfp_mask);
 	}
 	return NULL;
+got_pg:
+	map_pages(page, 1 << order);
+	return page;
 }
 
 /*
diff -puN mm/slab.c~unmap-page-debugging mm/slab.c
--- 25/mm/slab.c~unmap-page-debugging 2003-05-22 01:17:13.000000000 -0700
+++ 25-akpm/mm/slab.c 2003-05-22 01:17:13.000000000 -0700
@@ -85,6 +85,8 @@
 #include
 #include
 #include
+#include
+#include
 
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
@@ -753,6 +755,49 @@ static inline void kmem_freepages (kmem_
 }
 
 #if DEBUG
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static int __map_pages(struct page *page, unsigned int num, pgprot_t prot)
+{
+	int retval;
+
+	retval = change_page_attr(page,num,prot);
+	/* we should perform an IPI and flush all tlbs,
+	 * but that can deadlock->flush only current cpu.
+	 */
+	__flush_tlb_all();
+	return retval;
+}
+
+static void map_pages(void *objp, unsigned int size)
+{
+	struct page *pg = virt_to_page(objp);
+
+	if (pg->private == 1)
+		return;
+	__map_pages(pg, size/PAGE_SIZE, PAGE_KERNEL);
+}
+
+static void unmap_pages(void *objp, unsigned int size)
+{
+	struct page *pg = virt_to_page(objp);
+
+	if (__map_pages(pg, size/PAGE_SIZE, __pgprot(0)) < 0) {
+		pg->private = 1;
+		return;
+	}
+	pg->private = 0;
+}
+#else
+static void map_pages(void *objp, unsigned int size)
+{
+}
+
+static void unmap_pages(void *objp, unsigned int size)
+{
+}
+#endif
+
 static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
 {
 	int size = cachep->objsize;
@@ -846,8 +891,12 @@ static void slab_destroy (kmem_cache_t *
 		void *objp = slabp->s_mem + cachep->objsize * i;
 		int objlen = cachep->objsize;
 
-		if (cachep->flags & SLAB_POISON)
+		if (cachep->flags & SLAB_POISON) {
+			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
+				map_pages(objp, cachep->objsize);
+
 			check_poison_obj(cachep, objp);
+		}
 		if (cachep->flags & SLAB_STORE_USER)
 			objlen -= BYTES_PER_WORD;
@@ -1360,8 +1409,11 @@ static void cache_init_objs (kmem_cache_
 #if DEBUG
 		int objlen = cachep->objsize;
 		/* need to poison the objs? */
-		if (cachep->flags & SLAB_POISON)
+		if (cachep->flags & SLAB_POISON) {
 			poison_obj(cachep, objp, POISON_BEFORE);
+			if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+				unmap_pages(objp, cachep->objsize);
+		}
 		if (cachep->flags & SLAB_STORE_USER) {
 			objlen -= BYTES_PER_WORD;
 			((unsigned long*)(objp+objlen))[0] = 0;
@@ -1587,8 +1639,11 @@ static inline void *cache_free_debugchec
 		else
 			cachep->dtor(objp, cachep, 0);
 	}
-	if (cachep->flags & SLAB_POISON)
+	if (cachep->flags & SLAB_POISON) {
 		poison_obj(cachep, objp, POISON_AFTER);
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+			unmap_pages(objp, cachep->objsize);
+	}
 #endif
 	return objp;
 }
@@ -1722,8 +1777,13 @@ cache_alloc_debugcheck_after(kmem_cache_
 	if (!objp)
 		return objp;
-	if (cachep->flags & SLAB_POISON)
+	if (cachep->flags & SLAB_POISON) {
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+			map_pages(objp, cachep->objsize);
+		}
+
 		check_poison_obj(cachep, objp);
+	}
 	if (cachep->flags & SLAB_STORE_USER) {
 		objlen -= BYTES_PER_WORD;
 		*((void **)(objp+objlen)) = caller;
_
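
As promised above, here is a minimal, illustrative sketch of the intended calling pattern for the reworked change_page_attr()/global_flush_tlb() pair. It is not part of the patches: the helper names debug_unmap_range()/debug_remap_range() and the extern declarations are assumptions for the example only. The point it shows, taken from patch-umap-core, is that change_page_attr() merely rewrites the kernel ptes and queues reverted kpte pages on df_list under cpa_lock, while global_flush_tlb() does the cross-CPU flush and frees the deferred pages, so it must run with interrupts enabled.

#include <linux/mm.h>
#include <asm/pgtable.h>	/* PAGE_KERNEL, __pgprot() */

/* prototypes copied from the patched arch/i386/mm/pageattr.c */
extern int change_page_attr(struct page *page, int numpages, pgprot_t prot);
extern void global_flush_tlb(void);

/* hypothetical helper: drop numpages pages from the kernel linear mapping.
 * Safe even with interrupts disabled, since the flush is deferred.
 */
static void debug_unmap_range(struct page *page, int numpages)
{
	change_page_attr(page, numpages, __pgprot(0));
}

/* hypothetical helper: restore the normal mapping and complete the work.
 * global_flush_tlb() sends the flush IPI and frees the deferred kpte
 * pages; it BUG()s if called with interrupts disabled.
 */
static void debug_remap_range(struct page *page, int numpages)
{
	change_page_attr(page, numpages, PAGE_KERNEL);
	global_flush_tlb();
}

The gfp and slab hooks in patch-umap-gfp and patch-umap-slab follow the same pattern, except that they never call global_flush_tlb(): they may run with interrupts disabled, so they only flush the local TLB, as noted in the x86-isms item above.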