This code is playing with page->lru from pages which came from slab.  But to
remove page->list we need to convert slab over to using page->lru.  So we
cannot allow the i386 pagetable code to go scribbling on the ->lru field of
active slab pages.

This optimisation was pretty thin, and it is more important to shrink the
pageframe (on all architectures).

---

 25-akpm/arch/i386/mm/init.c               |   30 ++------
 25-akpm/arch/i386/mm/pageattr.c           |   25 +++----
 25-akpm/arch/i386/mm/pgtable.c            |  102 ++++++++++--------------
 25-akpm/include/asm-i386/pgtable-3level.h |    2 
 25-akpm/include/asm-i386/pgtable.h        |   30 ++++----
 5 files changed, 75 insertions(+), 114 deletions(-)

diff -puN arch/i386/mm/init.c~unslabify-pgds-and-pmds arch/i386/mm/init.c
--- 25/arch/i386/mm/init.c~unslabify-pgds-and-pmds	2004-04-03 03:00:13.795134376 -0800
+++ 25-akpm/arch/i386/mm/init.c	2004-04-03 03:00:13.803133160 -0800
@@ -523,30 +523,20 @@ void __init mem_init(void)
 #endif
 }
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+#ifdef CONFIG_X86_PAE
+struct kmem_cache_s *pae_pgd_cachep;
 
 void __init pgtable_cache_init(void)
 {
-	if (PTRS_PER_PMD > 1) {
-		pmd_cache = kmem_cache_create("pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					PTRS_PER_PMD*sizeof(pmd_t),
-					0,
-					pmd_ctor,
-					NULL);
-		if (!pmd_cache)
-			panic("pgtable_cache_init(): cannot create pmd cache");
-	}
-	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
-				0,
-				pgd_ctor,
-				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
-	if (!pgd_cache)
-		panic("pgtable_cache_init(): Cannot create pgd cache");
+	/*
+	 * PAE pgds must be 16-byte aligned:
+	 */
+	pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0,
+		SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+	if (!pae_pgd_cachep)
+		panic("init_pae(): Cannot alloc pae_pgd SLAB cache");
 }
+#endif
 
 /*
  * This function cannot be __init, since exceptions don't work in that
diff -puN arch/i386/mm/pageattr.c~unslabify-pgds-and-pmds arch/i386/mm/pageattr.c
--- 25/arch/i386/mm/pageattr.c~unslabify-pgds-and-pmds	2004-04-03 03:00:13.796134224 -0800
+++ 25-akpm/arch/i386/mm/pageattr.c	2004-04-03 03:00:13.803133160 -0800
@@ -67,22 +67,19 @@ static void flush_kernel_map(void *dummy
 
 static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 {
-	struct page *page;
-	unsigned long flags;
-
 	set_pte_atomic(kpte, pte);	/* change init_mm */
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_for_each_entry(page, &pgd_list, lru) {
-		pgd_t *pgd;
-		pmd_t *pmd;
-		pgd = (pgd_t *)page_address(page) + pgd_index(address);
-		pmd = pmd_offset(pgd, address);
-		set_pte_atomic((pte_t *)pmd, pte);
+#ifndef CONFIG_X86_PAE
+	{
+		struct list_head *l;
+		spin_lock(&mmlist_lock);
+		list_for_each(l, &init_mm.mmlist) {
+			struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
+			pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);
+			set_pte_atomic((pte_t *)pmd, pte);
+		}
+		spin_unlock(&mmlist_lock);
 	}
-	spin_unlock_irqrestore(&pgd_lock, flags);
+#endif
 }
 
 /*
diff -puN arch/i386/mm/pgtable.c~unslabify-pgds-and-pmds arch/i386/mm/pgtable.c
--- 25/arch/i386/mm/pgtable.c~unslabify-pgds-and-pmds	2004-04-03 03:00:13.797134072 -0800
+++ 25-akpm/arch/i386/mm/pgtable.c	2004-04-03 03:00:13.804133008 -0800
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
@@ -152,88 +151,61 @@ struct page *pte_alloc_one(struct mm_str
 	return pte;
 }
 
-void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
-{
-	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * If the locking proves to be non-performant, a ticketing scheme with
- * checks at dup_mmap(), exec(), and other mmlist addition points
- * could be used. The locking scheme was chosen on the basis of
- * manfred's recommendations and having no core impact whatsoever.
- * -- wli
- */
-spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
-LIST_HEAD(pgd_list);
+#ifdef CONFIG_X86_PAE
 
-void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	unsigned long flags;
-
-	if (PTRS_PER_PMD == 1)
-		spin_lock_irqsave(&pgd_lock, flags);
+	int i;
+	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
 
-	memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	if (pgd) {
+		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+			unsigned long pmd = __get_free_page(GFP_KERNEL);
+			if (!pmd)
+				goto out_oom;
+			clear_page(pmd);
+			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
+		}
+		memcpy(pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	list_add(&virt_to_page(pgd)->lru, &pgd_list);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+	}
+	return pgd;
+out_oom:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
+	return NULL;
 }
 
-/* never called when PTRS_PER_PMD > 1 */
-void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_free(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
+	int i;
 
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_del(&virt_to_page(pgd)->lru);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
 }
 
+#else
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	int i;
-	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
-
-	if (PTRS_PER_PMD == 1 || !pgd)
-		return pgd;
+	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-		if (!pmd)
-			goto out_oom;
-		set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
+	if (pgd) {
+		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
+		memcpy(pgd + USER_PTRS_PER_PGD,
+			swapper_pg_dir + USER_PTRS_PER_PGD,
+			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 	}
 	return pgd;
-
-out_oom:
-	for (i--; i >= 0; i--)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pgd_cache, pgd);
-	return NULL;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-	int i;
-
-	/* in the PAE case user pgd entries are overwritten before usage */
-	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	/* in the non-PAE case, clear_page_tables() clears user pgd entries */
-	kmem_cache_free(pgd_cache, pgd);
+	free_page((unsigned long)pgd);
 }
+
+#endif /* CONFIG_X86_PAE */
+
diff -puN include/asm-i386/pgtable-3level.h~unslabify-pgds-and-pmds include/asm-i386/pgtable-3level.h
--- 25/include/asm-i386/pgtable-3level.h~unslabify-pgds-and-pmds	2004-04-03 03:00:13.798133920 -0800
+++ 25-akpm/include/asm-i386/pgtable-3level.h	2004-04-03 03:00:13.805132856 -0800
@@ -123,4 +123,6 @@ static inline pmd_t pfn_pmd(unsigned lon
 #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
 #define PTE_FILE_MAX_BITS	32
 
+extern struct kmem_cache_s *pae_pgd_cachep;
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff -puN include/asm-i386/pgtable.h~unslabify-pgds-and-pmds include/asm-i386/pgtable.h
--- 25/include/asm-i386/pgtable.h~unslabify-pgds-and-pmds	2004-04-03 03:00:13.799133768 -0800
+++ 25-akpm/include/asm-i386/pgtable.h	2004-04-03 03:00:13.805132856 -0800
@@ -21,27 +21,15 @@
 #include
 #endif
 
-#include
-#include
-#include
+extern pgd_t swapper_pg_dir[1024];
+extern void paging_init(void);
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
-extern pgd_t swapper_pg_dir[1024];
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
-extern spinlock_t pgd_lock;
-extern struct list_head pgd_list;
-
-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
-void pgtable_cache_init(void);
-void paging_init(void);
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
 #endif /* !__ASSEMBLY__ */
 
@@ -53,8 +41,20 @@ void paging_init(void);
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_PAE
 # include
+
+/*
+ * Need to initialise the X86 PAE caches
+ */
+extern void pgtable_cache_init(void);
+
 #else
 # include
+
+/*
+ * No page table caches to initialise
+ */
+#define pgtable_cache_init() do { } while (0)
+
 #endif
 #endif
_
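
For context, the collision the changelog describes can be seen in a tiny
self-contained userspace sketch.  This is not part of the patch: struct page,
lru, pgd_list and list_add() below are simplified stand-ins for the kernel's
versions.  The point is only that a pageframe has a single lru linkage, so
slab and the old i386 pgd_list cannot both thread the same page through it.

/*
 * Minimal sketch (assumed names, not kernel code): two owners fighting
 * over one page->lru field.
 */
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* A pageframe hands out exactly one lru linkage. */
struct page {
	struct list_head lru;
};

static struct list_head slab_partial = { &slab_partial, &slab_partial };
static struct list_head pgd_list     = { &pgd_list, &pgd_list };

int main(void)
{
	struct page pg = { { &pg.lru, &pg.lru } };

	/* slab threads the page onto its partial list via page->lru ... */
	list_add(&pg.lru, &slab_partial);

	/*
	 * ... so the old pgd_ctor() doing the equivalent of
	 * list_add(&virt_to_page(pgd)->lru, &pgd_list) on that same
	 * slab-owned page re-points pg.lru into pgd_list.
	 */
	list_add(&pg.lru, &pgd_list);

	/* slab_partial.next still points at pg, but pg now links elsewhere. */
	printf("slab list corrupted: %d\n", pg.lru.next == &pgd_list);
	return 0;
}

Walking slab's list after the second list_add() wanders off into pgd_list;
that is the corruption the patch avoids by taking pgds and pmds out of slab.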