From: William Lee Irwin III

In case a more explicit description is required (only difference from
before is ripping out touch_all_pages() for CONFIG_DISCONTIGMEM):

(a) fixes oops on page->lru list poison in pgd_dtor() on PAE
(b) fixes pmd corruption due to userspace teardown not zeroing out the
    kernel pmd entries
(c) fixes pgd_ctor() performance problems on larger/PAE systems
(d) fixes touch_all_pages() boot-time triplefault on i386 discontig
(e) preserves pgd and pmd preconstruction
(f) preserves pgtable.c's #ifdef-lessness

Successfully tested on 28x/56GB (sorry, I lost a node to a temporary
hardware failure) with XKVA enabled. Without this patch (and the
cpumask_arith.h fixes) no runtime whatsoever is possible (in fact, the
touch_all_pages() issue takes it out well prior to console_init()).

vs. 2.6.0-test2-mm4

 arch/i386/mm/init.c        |   39 +++++++++-
 arch/i386/mm/pgtable.c     |  170 +++++++++++++++++++++++++++------------------
 include/asm-i386/pgtable.h |    4 -
 3 files changed, 142 insertions(+), 71 deletions(-)

diff -puN arch/i386/mm/init.c~4g4g-wli-fixes arch/i386/mm/init.c
--- 25/arch/i386/mm/init.c~4g4g-wli-fixes	2003-08-05 08:41:16.000000000 -0700
+++ 25-akpm/arch/i386/mm/init.c	2003-08-05 08:41:16.000000000 -0700
@@ -332,6 +332,11 @@ void __init zone_sizes_init(void)
 extern void zone_sizes_init(void);
 #endif /* !CONFIG_DISCONTIGMEM */
 
+#ifdef CONFIG_DISCONTIGMEM
+void __init touch_all_pages(void)
+{
+}
+#else
 void __init touch_all_pages(void)
 {
         int i;
@@ -346,6 +351,7 @@ void __init touch_all_pages(void)
 
         printk("done.\n");
 }
+#endif
 
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
@@ -515,11 +521,13 @@ void __init mem_init(void)
         load_LDT(&init_mm.context);
 }
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache;
 
 void __init pgtable_cache_init(void)
 {
+        void (*ctor)(void *, kmem_cache_t *, unsigned long);
+        void (*dtor)(void *, kmem_cache_t *, unsigned long);
+
         if (PTRS_PER_PMD > 1) {
                 pmd_cache = kmem_cache_create("pmd",
                                         PTRS_PER_PMD*sizeof(pmd_t),
@@ -529,13 +537,36 @@ void __init pgtable_cache_init(void)
                                         NULL);
                 if (!pmd_cache)
                         panic("pgtable_cache_init(): cannot create pmd cache");
+
+                if (TASK_SIZE > PAGE_OFFSET) {
+                        kpmd_cache = kmem_cache_create("kpmd",
+                                        PTRS_PER_PMD*sizeof(pmd_t),
+                                        0,
+                                        SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+                                        kpmd_ctor,
+                                        NULL);
+                        if (!kpmd_cache)
+                                panic("pgtable_cache_init(): "
+                                        "cannot create kpmd cache");
+                }
         }
+
+        if (PTRS_PER_PMD == 1 || TASK_SIZE <= PAGE_OFFSET)
+                ctor = pgd_ctor;
+        else
+                ctor = NULL;
+
+        if (PTRS_PER_PMD == 1 && TASK_SIZE <= PAGE_OFFSET)
+                dtor = pgd_dtor;
+        else
+                dtor = NULL;
+
         pgd_cache = kmem_cache_create("pgd",
                                 PTRS_PER_PGD*sizeof(pgd_t),
                                 0,
                                 SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-                                pgd_ctor,
-                                pgd_dtor);
+                                ctor,
+                                dtor);
         if (!pgd_cache)
                 panic("pgtable_cache_init(): Cannot create pgd cache");
 }
diff -puN arch/i386/mm/pgtable.c~4g4g-wli-fixes arch/i386/mm/pgtable.c
--- 25/arch/i386/mm/pgtable.c~4g4g-wli-fixes	2003-08-05 08:41:16.000000000 -0700
+++ 25-akpm/arch/i386/mm/pgtable.c	2003-08-05 08:41:16.000000000 -0700
@@ -158,6 +158,17 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
         memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
 }
 
+void kpmd_ctor(void *__pmd, kmem_cache_t *cache, unsigned long flags)
+{
+        pmd_t *kpmd, *pmd;
+        kpmd = pmd_offset(&swapper_pg_dir[PTRS_PER_PGD-1],
+                                (PTRS_PER_PMD - NR_SHARED_PMDS)*PMD_SIZE);
+        pmd = (pmd_t *)__pmd + (PTRS_PER_PMD - NR_SHARED_PMDS);
+
+        memset(__pmd, 0, (PTRS_PER_PMD - NR_SHARED_PMDS)*sizeof(pmd_t));
+        memcpy(pmd, kpmd, NR_SHARED_PMDS*sizeof(pmd_t));
+}
+
 /*
  * List of all pgd's needed so it can invalidate entries in both cached
  * and uncached pgd's. This is essentially codepath-based locking
@@ -169,24 +180,60 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
  * could be used. The locking scheme was chosen on the basis of
  * manfred's recommendations and having no core impact whatsoever.
  * -- wli
+ *
+ * The entire issue goes away when XKVA is configured.
  */
 spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
 LIST_HEAD(pgd_list);
 
+/*
+ * This is not that hard to figure out.
+ * (a) PTRS_PER_PMD == 1 means non-PAE.
+ * (b) PTRS_PER_PMD > 1 means PAE.
+ * (c) TASK_SIZE > PAGE_OFFSET means XKVA.
+ * (d) TASK_SIZE <= PAGE_OFFSET means non-XKVA.
+ *
+ * Do *NOT* back out the preconstruction like the patch I'm cleaning
+ * up after this very instant did, or at all, for that matter.
+ * This is never called when PTRS_PER_PMD > 1 && TASK_SIZE > PAGE_OFFSET.
+ * -- wli
+ */
 void pgd_ctor(void *__pgd, kmem_cache_t *cache, unsigned long unused)
 {
+        pgd_t *pgd = (pgd_t *)__pgd;
         unsigned long flags;
-        pgd_t *pgd0 = __pgd;
+
+        if (PTRS_PER_PMD == 1) {
+                if (TASK_SIZE <= PAGE_OFFSET)
+                        spin_lock_irqsave(&pgd_lock, flags);
+                else
+                        memcpy(&pgd[PTRS_PER_PGD - NR_SHARED_PMDS],
+                                &swapper_pg_dir[PTRS_PER_PGD - NR_SHARED_PMDS],
+                                NR_SHARED_PMDS * sizeof(pgd_t));
+        }
+
+        if (TASK_SIZE <= PAGE_OFFSET)
+                memcpy(pgd + USER_PTRS_PER_PGD,
+                        swapper_pg_dir + USER_PTRS_PER_PGD,
+                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 
         if (PTRS_PER_PMD > 1)
                 return;
 
-        spin_lock_irqsave(&pgd_lock, flags);
-        list_add(&virt_to_page(pgd0)->lru, &pgd_list);
-        spin_unlock_irqrestore(&pgd_lock, flags);
+        if (TASK_SIZE > PAGE_OFFSET)
+                memset(pgd, 0, (PTRS_PER_PGD - NR_SHARED_PMDS)*sizeof(pgd_t));
+        else {
+                list_add(&virt_to_page(pgd)->lru, &pgd_list);
+                spin_unlock_irqrestore(&pgd_lock, flags);
+                memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+        }
 }
 
-/* never called when PTRS_PER_PMD > 1 */
+/*
+ * Never called when PTRS_PER_PMD > 1 || TASK_SIZE > PAGE_OFFSET
+ * for with PAE we would list_del() multiple times, and for non-PAE
+ * with XKVA all the AGP pgd shootdown code is unnecessary.
+ */
 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 {
         unsigned long flags; /* can be called from interrupt context */
@@ -196,87 +243,80 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
         spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
-#ifdef CONFIG_X86_PAE
-
+/*
+ * See the comments above pgd_ctor() wrt. preconstruction.
+ * Do *NOT* memcpy() here. If you do, you back out important
+ * anti- cache pollution code.
+ *
+ */
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
         int i;
         pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
 
-        if (pgd) {
-#ifdef CONFIG_X86_4G_VM_LAYOUT
-                pmd_t *pmd0, *kernel_pmd0;
-#endif
-                pmd_t *pmd;
+        if (PTRS_PER_PMD == 1 || !pgd)
+                return pgd;
 
-                for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-                        pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-                        if (!pmd)
-                                goto out_oom;
-                        set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
-                }
+        /*
+         * In the 4G userspace case alias the top 16 MB virtual
+         * memory range into the user mappings as well (these
+         * include the trampoline and CPU data structures).
+         */
+        for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+                kmem_cache_t *cache;
+                pmd_t *pmd;
 
-#ifdef CONFIG_X86_4G_VM_LAYOUT
-                /*
-                 * In the 4G userspace case alias the top 16 MB virtual
-                 * memory range into the user mappings as well (these
-                 * include the trampoline and CPU data structures).
-                 */
-                pmd0 = pmd;
-                kernel_pmd0 = (pmd_t *)__va(pgd_val(swapper_pg_dir[PTRS_PER_PGD-1]) & PAGE_MASK);
-                memcpy(pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, kernel_pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, sizeof(pmd_t) * NR_SHARED_PMDS);
-#else
-                memcpy(pgd + USER_PTRS_PER_PGD,
-                        swapper_pg_dir + USER_PTRS_PER_PGD,
-                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-#endif
+                if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1)
+                        cache = kpmd_cache;
+                else
+                        cache = pmd_cache;
+
+                pmd = kmem_cache_alloc(cache, GFP_KERNEL);
+                if (!pmd)
+                        goto out_oom;
+                set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
         }
+
         return pgd;
 
 out_oom:
+        /*
+         * we don't have to handle the kpmd_cache here, since it's the
+         * last allocation, and has either nothing to free or when it
+         * succeeds the whole operation succeeds.
+         */
         for (i--; i >= 0; i--)
                 kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
         kmem_cache_free(pgd_cache, pgd);
         return NULL;
 }
 
-#else  /* ! PAE */
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
-
-        if (pgd) {
-#ifdef CONFIG_X86_4G_VM_LAYOUT
-                memset(pgd, 0, PTRS_PER_PGD * sizeof(pgd_t));
-                /*
-                 * In the 4G userspace case alias the top 16 MB virtual
-                 * memory range into the user mappings as well (these
-                 * include the trampoline and CPU data structures).
-                 */
-                memcpy(pgd + PTRS_PER_PGD-NR_SHARED_PMDS,
-                        swapper_pg_dir + PTRS_PER_PGD-NR_SHARED_PMDS,
-                        NR_SHARED_PMDS * sizeof(pgd_t));
-#else
-                memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-                memcpy(pgd + USER_PTRS_PER_PGD,
-                        swapper_pg_dir + USER_PTRS_PER_PGD,
-                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-#endif
-        }
-        return pgd;
-}
-
-#endif /* CONFIG_X86_PAE */
-
 void pgd_free(pgd_t *pgd)
 {
         int i;
 
-        /* in the PAE case user pgd entries are overwritten before usage */
-        if (PTRS_PER_PMD > 1)
-                for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-                        kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
         /* in the non-PAE case, clear_page_tables() clears user pgd entries */
+        if (PTRS_PER_PMD == 1)
+                goto out_free;
+
+        /* in the PAE case user pgd entries are overwritten before usage */
+        for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+                kmem_cache_t *cache;
+                pmd_t *pmd = __va(pgd_val(pgd[i]) - 1);
+
+                /*
+                 * only userspace pmd's are cleared for us
+                 * by mm/memory.c; it's a slab cache invariant
+                 * that we must separate the kernel pmd slab
+                 * all times, else we'll have bad pmd's.
+                 */
+                if (TASK_SIZE > PAGE_OFFSET && i == USER_PTRS_PER_PGD - 1)
+                        cache = kpmd_cache;
+                else
+                        cache = pmd_cache;
+
+                kmem_cache_free(cache, pmd);
+        }
+out_free:
         kmem_cache_free(pgd_cache, pgd);
 }
diff -puN include/asm-i386/pgtable.h~4g4g-wli-fixes include/asm-i386/pgtable.h
--- 25/include/asm-i386/pgtable.h~4g4g-wli-fixes	2003-08-05 08:41:16.000000000 -0700
+++ 25-akpm/include/asm-i386/pgtable.h	2003-08-05 08:41:16.000000000 -0700
@@ -32,12 +32,12 @@
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
 extern pgd_t swapper_pg_dir[1024];
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
+extern kmem_cache_t *pgd_cache, *pmd_cache, *kpmd_cache;
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
+void kpmd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_ctor(void *, kmem_cache_t *, unsigned long);
 void pgd_dtor(void *, kmem_cache_t *, unsigned long);
 void pgtable_cache_init(void);
_
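
For reference, the ctor/dtor selection done in pgtable_cache_init() above
reduces to a two-flag decision table. The sketch below is a minimal
user-space rendering of that logic only; the "pae" and "xkva" flags stand
in for PTRS_PER_PMD > 1 and TASK_SIZE > PAGE_OFFSET, and pick_ctor() /
pick_dtor() are illustrative names, not kernel symbols.

#include <stdio.h>
#include <stdbool.h>

/* pgd_ctor is used unless both PAE and XKVA are enabled */
static const char *pick_ctor(bool pae, bool xkva)
{
        return (!pae || !xkva) ? "pgd_ctor" : "NULL";
}

/* pgd_dtor (the pgd_list removal) only exists for non-PAE, non-XKVA */
static const char *pick_dtor(bool pae, bool xkva)
{
        return (!pae && !xkva) ? "pgd_dtor" : "NULL";
}

int main(void)
{
        for (int pae = 0; pae <= 1; pae++)
                for (int xkva = 0; xkva <= 1; xkva++)
                        printf("PAE=%d XKVA=%d -> ctor=%s dtor=%s\n",
                               pae, xkva,
                               pick_ctor(pae, xkva), pick_dtor(pae, xkva));
        return 0;
}

This prints pgd_ctor/pgd_dtor only for the non-PAE, non-XKVA combination,
pgd_ctor with a NULL dtor for the two mixed cases, and NULL/NULL for
PAE plus XKVA, where kpmd_ctor takes over the kernel pmd copying instead.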