diff -purN -X /home/mbligh/.diff.exclude reference/arch/arm/mm/fault-armv.c current/arch/arm/mm/fault-armv.c --- reference/arch/arm/mm/fault-armv.c 2003-10-01 11:47:31.000000000 -0700 +++ current/arch/arm/mm/fault-armv.c 2004-04-12 10:22:25.000000000 -0700 @@ -191,7 +191,7 @@ void __flush_dcache_page(struct page *pa __cpuc_flush_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* @@ -292,7 +292,7 @@ void update_mmu_cache(struct vm_area_str if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - if (page->mapping) { + if (page_mapping(page)) { int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags); if (dirty) diff -purN -X /home/mbligh/.diff.exclude reference/arch/arm/mm/mm-armv.c current/arch/arm/mm/mm-armv.c --- reference/arch/arm/mm/mm-armv.c 2004-03-11 14:33:33.000000000 -0800 +++ current/arch/arm/mm/mm-armv.c 2004-04-12 10:22:26.000000000 -0700 @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -232,7 +231,7 @@ void free_pgd_slow(pgd_t *pgd) pte = pmd_page(*pmd); pmd_clear(pmd); - pgtable_remove_rmap(pte); + dec_page_state(nr_page_table_pages); pte_free(pte); pmd_free(pmd); free: diff -purN -X /home/mbligh/.diff.exclude reference/arch/mips/mm/cache.c current/arch/mips/mm/cache.c --- reference/arch/mips/mm/cache.c 2004-03-11 14:33:51.000000000 -0800 +++ current/arch/mips/mm/cache.c 2004-04-12 10:22:25.000000000 -0700 @@ -57,7 +57,7 @@ void flush_dcache_page(struct page *page { unsigned long addr; - if (page->mapping && + if (page_mapping(page) && list_empty(&page->mapping->i_mmap) && list_empty(&page->mapping->i_mmap_shared)) { SetPageDcacheDirty(page); @@ -66,7 +66,7 @@ void flush_dcache_page(struct page *page } /* - * We could delay the flush for the !page->mapping case too. But that + * We could delay the flush for the !page_mapping case too. But that * case is for exec env/arg pages and those are %99 certainly going to * get faulted into the tlb (and thus flushed) anyways. 
*/ @@ -81,7 +81,7 @@ void __update_cache(struct vm_area_struc unsigned long pfn, addr; pfn = pte_pfn(pte); - if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page->mapping) && + if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page_mapping(page)) && Page_dcache_dirty(page)) { if (pages_do_alias((unsigned long)page_address(page), address & PAGE_MASK)) { diff -purN -X /home/mbligh/.diff.exclude reference/arch/parisc/kernel/cache.c current/arch/parisc/kernel/cache.c --- reference/arch/parisc/kernel/cache.c 2004-01-15 10:41:01.000000000 -0800 +++ current/arch/parisc/kernel/cache.c 2004-04-12 10:22:25.000000000 -0700 @@ -68,7 +68,7 @@ update_mmu_cache(struct vm_area_struct * { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && page->mapping && + if (VALID_PAGE(page) && page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { flush_kernel_dcache_page(page_address(page)); @@ -234,7 +234,7 @@ void __flush_dcache_page(struct page *pa flush_kernel_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* check shared list first if it's not empty...it's usually * the shortest */ diff -purN -X /home/mbligh/.diff.exclude reference/arch/ppc/mm/pgtable.c current/arch/ppc/mm/pgtable.c --- reference/arch/ppc/mm/pgtable.c 2004-04-07 14:54:00.000000000 -0700 +++ current/arch/ppc/mm/pgtable.c 2004-04-12 10:22:26.000000000 -0700 @@ -86,9 +86,14 @@ pte_t *pte_alloc_one_kernel(struct mm_st extern int mem_init_done; extern void *early_get_page(void); - if (mem_init_done) + if (mem_init_done) { pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - else + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + } + } else pte = (pte_t *)early_get_page(); if (pte) clear_page(pte); @@ -97,7 +102,7 @@ pte_t *pte_alloc_one_kernel(struct mm_st struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *pte; + struct page *ptepage; #ifdef CONFIG_HIGHPTE int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT; @@ -105,10 +110,13 @@ struct page *pte_alloc_one(struct mm_str int flags = GFP_KERNEL | __GFP_REPEAT; #endif - pte = alloc_pages(flags, 0); - if (pte) - clear_highpage(pte); - return pte; + ptepage = alloc_pages(flags, 0); + if (ptepage) { + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + clear_highpage(ptepage); + } + return ptepage; } void pte_free_kernel(pte_t *pte) @@ -116,15 +124,17 @@ void pte_free_kernel(pte_t *pte) #ifdef CONFIG_SMP hash_page_sync(); #endif + virt_to_page(pte)->mapping = NULL; free_page((unsigned long)pte); } -void pte_free(struct page *pte) +void pte_free(struct page *ptepage) { #ifdef CONFIG_SMP hash_page_sync(); #endif - __free_page(pte); + ptepage->mapping = NULL; + __free_page(ptepage); } #ifndef CONFIG_44x diff -purN -X /home/mbligh/.diff.exclude reference/arch/ppc64/mm/hugetlbpage.c current/arch/ppc64/mm/hugetlbpage.c --- reference/arch/ppc64/mm/hugetlbpage.c 2004-04-07 14:54:00.000000000 -0700 +++ current/arch/ppc64/mm/hugetlbpage.c 2004-04-12 10:22:27.000000000 -0700 @@ -25,7 +25,6 @@ #include #include #include -#include #include @@ -279,7 +278,7 @@ static int open_32bit_htlbpage_range(str } pmd_clear(pmd); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free(page); } } diff -purN -X /home/mbligh/.diff.exclude reference/arch/ppc64/mm/tlb.c current/arch/ppc64/mm/tlb.c --- reference/arch/ppc64/mm/tlb.c 2004-03-11 14:33:55.000000000 -0800 +++ current/arch/ppc64/mm/tlb.c 2004-04-12 
10:22:27.000000000 -0700 @@ -31,7 +31,6 @@ #include #include #include -#include DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); @@ -59,7 +58,8 @@ void hpte_update(pte_t *ptep, unsigned l ptepage = virt_to_page(ptep); mm = (struct mm_struct *) ptepage->mapping; - addr = ptep_to_address(ptep); + addr = ptepage->index + + (((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE); if (REGION_ID(addr) == USER_REGION_ID) context = mm->context.id; diff -purN -X /home/mbligh/.diff.exclude reference/arch/sparc64/kernel/smp.c current/arch/sparc64/kernel/smp.c --- reference/arch/sparc64/kernel/smp.c 2004-04-07 14:54:02.000000000 -0700 +++ current/arch/sparc64/kernel/smp.c 2004-04-12 10:22:25.000000000 -0700 @@ -671,9 +671,9 @@ static __inline__ void __local_flush_dca #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -694,7 +694,7 @@ void smp_flush_dcache_page_impl(struct p if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), @@ -727,7 +727,7 @@ void flush_dcache_page_all(struct mm_str goto flush_self; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), diff -purN -X /home/mbligh/.diff.exclude reference/arch/sparc64/mm/init.c current/arch/sparc64/mm/init.c --- reference/arch/sparc64/mm/init.c 2004-04-07 14:54:02.000000000 -0700 +++ current/arch/sparc64/mm/init.c 2004-04-12 10:22:25.000000000 -0700 @@ -139,9 +139,9 @@ __inline__ void flush_dcache_page_impl(s #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -203,7 +203,7 @@ void update_mmu_cache(struct vm_area_str pfn = pte_pfn(pte); if (pfn_valid(pfn) && - (page = pfn_to_page(pfn), page->mapping) && + (page = pfn_to_page(pfn), page_mapping(page)) && ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL)); @@ -227,7 +227,7 @@ void flush_dcache_page(struct page *page int dirty = test_bit(PG_dcache_dirty, &page->flags); int dirty_cpu = dcache_dirty_cpu(page); - if (page->mapping && + if (page_mapping(page) && list_empty(&page->mapping->i_mmap) && list_empty(&page->mapping->i_mmap_shared)) { if (dirty) { @@ -237,7 +237,7 @@ void flush_dcache_page(struct page *page } set_dcache_dirty(page); } else { - /* We could delay the flush for the !page->mapping + /* We could delay the flush for the !page_mapping * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. 
@@ -279,7 +279,7 @@ static inline void flush_cache_pte_range if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - if (PageReserved(page) || !page->mapping) + if (PageReserved(page) || !page_mapping(page)) continue; pgaddr = (unsigned long) page_address(page); uaddr = address + offset; diff -purN -X /home/mbligh/.diff.exclude reference/fs/buffer.c current/fs/buffer.c --- reference/fs/buffer.c 2004-04-07 14:54:28.000000000 -0700 +++ current/fs/buffer.c 2004-04-12 10:22:25.000000000 -0700 @@ -837,19 +837,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. - * - * For now, we treat swapper_space specially. It doesn't use the normal - * block a_ops. */ int __set_page_dirty_buffers(struct page *page) { struct address_space * const mapping = page->mapping; - int ret = 0; - - if (mapping == NULL) { - SetPageDirty(page); - goto out; - } spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { @@ -878,8 +869,7 @@ int __set_page_dirty_buffers(struct page __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } -out: - return ret; + return 0; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1576,7 +1566,7 @@ static inline void discard_buffer(struct */ int try_to_release_page(struct page *page, int gfp_mask) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); if (!PageLocked(page)) BUG(); @@ -2881,7 +2871,7 @@ failed: int try_to_free_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); struct buffer_head *buffers_to_free = NULL; int ret = 0; @@ -2889,14 +2879,14 @@ int try_to_free_buffers(struct page *pag if (PageWriteback(page)) return 0; - if (mapping == NULL) { /* swapped-in anon page */ + if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(page, &buffers_to_free); goto out; } spin_lock(&mapping->private_lock); ret = drop_buffers(page, &buffers_to_free); - if (ret && !PageSwapCache(page)) { + if (ret) { /* * If the filesystem writes its buffers by hand (eg ext3) * then we can have clean buffers against a dirty page. We diff -purN -X /home/mbligh/.diff.exclude reference/fs/exec.c current/fs/exec.c --- reference/fs/exec.c 2004-03-11 14:35:06.000000000 -0800 +++ current/fs/exec.c 2004-04-12 10:22:26.000000000 -0700 @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include @@ -293,53 +293,46 @@ EXPORT_SYMBOL(copy_strings_kernel); * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * tsk->mm->mmap_sem is held for writing. 
*/ void put_dirty_page(struct task_struct *tsk, struct page *page, unsigned long address, pgprot_t prot) { + struct mm_struct *mm = tsk->mm; pgd_t * pgd; pmd_t * pmd; pte_t * pte; - struct pte_chain *pte_chain; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { pte_unmap(pte); goto out; } + mm->rss++; lru_cache_add_active(page); flush_dcache_page(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_anon_rmap(page, mm, address); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: + spin_unlock(&mm->page_table_lock); __free_page(page); force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); - return; } int setup_arg_pages(struct linux_binprm *bprm) diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-alpha/pgtable.h current/include/asm-alpha/pgtable.h --- reference/include/asm-alpha/pgtable.h 2003-10-14 15:50:32.000000000 -0700 +++ current/include/asm-alpha/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -349,6 +349,4 @@ extern void paging_init(void); /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA -typedef pte_t *pte_addr_t; - #endif /* _ALPHA_PGTABLE_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-alpha/rmap.h current/include/asm-alpha/rmap.h --- reference/include/asm-alpha/rmap.h 2002-12-09 18:46:10.000000000 -0800 +++ current/include/asm-alpha/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _ALPHA_RMAP_H -#define _ALPHA_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-arm/cacheflush.h current/include/asm-arm/cacheflush.h --- reference/include/asm-arm/cacheflush.h 2004-03-11 14:35:14.000000000 -0800 +++ current/include/asm-arm/cacheflush.h 2004-04-12 10:22:25.000000000 -0700 @@ -283,7 +283,7 @@ flush_cache_page(struct vm_area_struct * * flush_dcache_page is used when the kernel has written to the page * cache page at virtual address page->virtual. * - * If this page isn't mapped (ie, page->mapping = NULL), or it has + * If this page isn't mapped (ie, page_mapping == NULL), or it has * userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared) * then we _must_ always clean + invalidate the dcache entries associated * with the kernel mapping. 
@@ -299,7 +299,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && !mapping_mapped(page->mapping)) + if (page_mapping(page) && !mapping_mapped(page->mapping)) set_bit(PG_dcache_dirty, &page->flags); else __flush_dcache_page(page); diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-arm/kmap_types.h current/include/asm-arm/kmap_types.h --- reference/include/asm-arm/kmap_types.h 2004-01-15 10:41:16.000000000 -0800 +++ current/include/asm-arm/kmap_types.h 2004-04-12 10:22:27.000000000 -0700 @@ -14,7 +14,6 @@ enum km_type { KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, - KM_PTE2, KM_IRQ0, KM_IRQ1, KM_SOFTIRQ0, diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-arm/pgtable.h current/include/asm-arm/pgtable.h --- reference/include/asm-arm/pgtable.h 2004-01-15 10:41:16.000000000 -0800 +++ current/include/asm-arm/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -353,8 +353,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD #define io_remap_page_range(vma,from,phys,size,prot) \ remap_page_range(vma,from,phys,size,prot) -typedef pte_t *pte_addr_t; - #define pgtable_cache_init() do { } while (0) #endif /* !__ASSEMBLY__ */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-arm/rmap.h current/include/asm-arm/rmap.h --- reference/include/asm-arm/rmap.h 2002-12-09 18:45:42.000000000 -0800 +++ current/include/asm-arm/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,6 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -#include - -#endif /* _ARM_RMAP_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-arm26/pgtable.h current/include/asm-arm26/pgtable.h --- reference/include/asm-arm26/pgtable.h 2003-10-14 15:50:32.000000000 -0700 +++ current/include/asm-arm26/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -290,8 +290,6 @@ static inline pte_t mk_pte_phys(unsigned #define io_remap_page_range(vma,from,phys,size,prot) \ remap_page_range(vma,from,phys,size,prot) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #endif /* _ASMARM_PGTABLE_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-arm26/rmap.h current/include/asm-arm26/rmap.h --- reference/include/asm-arm26/rmap.h 2003-06-19 14:41:50.000000000 -0700 +++ current/include/asm-arm26/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,66 +0,0 @@ -#ifndef _ARM_RMAP_H -#define _ARM_RMAP_H - -/* - * linux/include/asm-arm26/proc-armv/rmap.h - * - * Architecture dependant parts of the reverse mapping code, - * - * ARM is different since hardware page tables are smaller than - * the page size and Linux uses a "duplicate" one with extra info. - * For rmap this means that the first 2 kB of a page are the hardware - * page tables and the last 2 kB are the software page tables. 
- */ - -static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) -{ - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page *page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; -} - -/* The page table takes half of the page */ -#define PTE_MASK ((PAGE_SIZE / 2) - 1) - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = virt_to_page(ptep); - unsigned long low_bits; - - low_bits = ((unsigned long)ptep & PTE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -//FIXME!!! IS these correct? -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} - -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} - - -//#include - -#endif /* _ARM_RMAP_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-cris/pgtable.h current/include/asm-cris/pgtable.h --- reference/include/asm-cris/pgtable.h 2003-07-28 15:31:11.000000000 -0700 +++ current/include/asm-cris/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -337,6 +337,4 @@ extern inline void update_mmu_cache(stru #define pte_to_pgoff(x) (pte_val(x) >> 6) #define pgoff_to_pte(x) __pte(((x) << 6) | _PAGE_FILE) -typedef pte_t *pte_addr_t; - #endif /* _CRIS_PGTABLE_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-cris/rmap.h current/include/asm-cris/rmap.h --- reference/include/asm-cris/rmap.h 2002-12-09 18:46:10.000000000 -0800 +++ current/include/asm-cris/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _CRIS_RMAP_H -#define _CRIS_RMAP_H - -/* nothing to see, move along :) */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-generic/rmap.h current/include/asm-generic/rmap.h --- reference/include/asm-generic/rmap.h 2003-06-05 14:56:02.000000000 -0700 +++ current/include/asm-generic/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. 
- */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-h8300/pgtable.h current/include/asm-h8300/pgtable.h --- reference/include/asm-h8300/pgtable.h 2003-10-01 11:35:30.000000000 -0700 +++ current/include/asm-h8300/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -7,8 +7,6 @@ #include #include -typedef pte_t *pte_addr_t; - #define pgd_present(pgd) (1) /* pages are always present on NO_MM */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-i386/kmap_types.h current/include/asm-i386/kmap_types.h --- reference/include/asm-i386/kmap_types.h 2003-06-05 14:56:03.000000000 -0700 +++ current/include/asm-i386/kmap_types.h 2004-04-12 10:22:27.000000000 -0700 @@ -19,7 +19,6 @@ D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_PTE2, D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-i386/pgtable.h current/include/asm-i386/pgtable.h --- reference/include/asm-i386/pgtable.h 2004-04-07 14:54:32.000000000 -0700 +++ current/include/asm-i386/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -308,18 +308,6 @@ static inline pte_t pte_modify(pte_t pte #define pte_unmap_nested(pte) do { } while (0) #endif -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif - -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) -typedef u64 pte_addr_t; -#endif - -#if !defined(CONFIG_HIGHPTE) -typedef pte_t *pte_addr_t; -#endif - /* * The i386 doesn't have any external MMU info: the kernel page * tables contain all the necessary information. 
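(The pte_addr_t typedefs and the KM_PTE2 kmap slot being removed in these hunks existed only so pte_chain rmap could map a pte back to its owner; with the object-based scheme the owning mm and user address come straight from the page-table page, which pte_alloc_one() above now tags with ->mapping and ->index. A minimal userspace sketch of that recovery follows, assuming 4 kB pages and 8-byte ptes; struct ptepage_info and the constants are illustrative stand-ins, not kernel API.)

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PTRS_PER_PTE  (PAGE_SIZE / sizeof(uint64_t))   /* one 8-byte pte per slot */

struct mm_struct { int dummy; };

/* Simplified page-table page descriptor: mirrors what pte_alloc_one()
 * records in the ppc/ppc64 hunks above. */
struct ptepage_info {
	struct mm_struct *mapping;   /* owning mm, set at pte_alloc_one() */
	unsigned long index;         /* address & PMD_MASK, set at pte_alloc_one() */
};

/* Recover the user virtual address a pte slot maps, from the pte pointer's
 * offset inside its page-table page -- the same arithmetic the ppc64
 * hpte_update() hunk uses in place of the old ptep_to_address(). */
static unsigned long ptep_to_address(const struct ptepage_info *pg,
				     unsigned long ptep_addr)
{
	/* Only the pte's byte offset within its page-table page matters. */
	return pg->index + ((ptep_addr & ~PAGE_MASK) * PTRS_PER_PTE);
}

int main(void)
{
	struct mm_struct mm;
	struct ptepage_info pg = { .mapping = &mm, .index = 0x08040000UL };

	/* pretend the page-table page is page-aligned; slot 3 maps index + 3 * PAGE_SIZE */
	unsigned long addr = ptep_to_address(&pg, 3 * sizeof(uint64_t));
	assert(addr == 0x08040000UL + 3 * PAGE_SIZE);
	printf("pte slot 3 -> user address %#lx\n", addr);
	return 0;
}
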
diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-i386/rmap.h current/include/asm-i386/rmap.h --- reference/include/asm-i386/rmap.h 2002-12-09 18:46:11.000000000 -0800 +++ current/include/asm-i386/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ia64/pgtable.h current/include/asm-ia64/pgtable.h --- reference/include/asm-ia64/pgtable.h 2004-02-04 16:24:28.000000000 -0800 +++ current/include/asm-ia64/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -468,8 +468,6 @@ extern void hugetlb_free_pgtables(struct struct vm_area_struct * prev, unsigned long start, unsigned long end); #endif -typedef pte_t *pte_addr_t; - /* * IA-64 doesn't have any external MMU info: the page tables contain all the necessary * information. However, we use this routine to take care of any (delayed) i-cache diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ia64/rmap.h current/include/asm-ia64/rmap.h --- reference/include/asm-ia64/rmap.h 2002-12-09 18:45:55.000000000 -0800 +++ current/include/asm-ia64/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _ASM_IA64_RMAP_H -#define _ASM_IA64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* _ASM_IA64_RMAP_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-m68k/pgtable.h current/include/asm-m68k/pgtable.h --- reference/include/asm-m68k/pgtable.h 2004-02-04 16:24:29.000000000 -0800 +++ current/include/asm-m68k/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -168,8 +168,6 @@ static inline void update_mmu_cache(stru ? (__pgprot((pgprot_val(prot) & _CACHEMASK040) | _PAGE_NOCACHE_S)) \ : (prot))) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ /* diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-m68k/rmap.h current/include/asm-m68k/rmap.h --- reference/include/asm-m68k/rmap.h 2002-12-09 18:46:24.000000000 -0800 +++ current/include/asm-m68k/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _M68K_RMAP_H -#define _M68K_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-m68knommu/pgtable.h current/include/asm-m68knommu/pgtable.h --- reference/include/asm-m68knommu/pgtable.h 2003-06-05 14:56:22.000000000 -0700 +++ current/include/asm-m68knommu/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -11,8 +11,6 @@ #include #include -typedef pte_t *pte_addr_t; - /* * Trivial page table functions. 
*/ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-m68knommu/rmap.h current/include/asm-m68knommu/rmap.h --- reference/include/asm-m68knommu/rmap.h 2002-12-09 18:45:53.000000000 -0800 +++ current/include/asm-m68knommu/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,2 +0,0 @@ -/* Do not need anything here */ - diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-mips/kmap_types.h current/include/asm-mips/kmap_types.h --- reference/include/asm-mips/kmap_types.h 2003-07-02 14:44:55.000000000 -0700 +++ current/include/asm-mips/kmap_types.h 2004-04-12 10:22:27.000000000 -0700 @@ -19,7 +19,6 @@ D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, D(7) KM_PTE0, D(8) KM_PTE1, -D(9) KM_PTE2, D(10) KM_IRQ0, D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-mips/pgtable-32.h current/include/asm-mips/pgtable-32.h --- reference/include/asm-mips/pgtable-32.h 2004-03-11 14:35:20.000000000 -0800 +++ current/include/asm-mips/pgtable-32.h 2004-04-12 10:22:27.000000000 -0700 @@ -216,10 +216,4 @@ static inline pmd_t *pmd_offset(pgd_t *d #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#ifdef CONFIG_64BIT_PHYS_ADDR -typedef u64 pte_addr_t; -#else -typedef pte_t *pte_addr_t; -#endif - #endif /* _ASM_PGTABLE_32_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-mips/pgtable-64.h current/include/asm-mips/pgtable-64.h --- reference/include/asm-mips/pgtable-64.h 2004-03-11 14:35:20.000000000 -0800 +++ current/include/asm-mips/pgtable-64.h 2004-04-12 10:22:27.000000000 -0700 @@ -214,6 +214,4 @@ static inline pte_t mk_swap_pte(unsigned #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -typedef pte_t *pte_addr_t; - #endif /* _ASM_PGTABLE_64_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-mips/rmap.h current/include/asm-mips/rmap.h --- reference/include/asm-mips/rmap.h 2003-07-02 14:44:56.000000000 -0700 +++ current/include/asm-mips/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef __ASM_RMAP_H -#define __ASM_RMAP_H - -/* nothing to see, move along */ -#include - -#endif /* __ASM_RMAP_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-parisc/cacheflush.h current/include/asm-parisc/cacheflush.h --- reference/include/asm-parisc/cacheflush.h 2003-10-14 15:50:33.000000000 -0700 +++ current/include/asm-parisc/cacheflush.h 2004-04-12 10:22:25.000000000 -0700 @@ -69,7 +69,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && list_empty(&page->mapping->i_mmap) && + if (page_mapping(page) && list_empty(&page->mapping->i_mmap) && list_empty(&page->mapping->i_mmap_shared)) { set_bit(PG_dcache_dirty, &page->flags); } else { diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-parisc/pgtable.h current/include/asm-parisc/pgtable.h --- reference/include/asm-parisc/pgtable.h 2004-02-04 16:24:29.000000000 -0800 +++ current/include/asm-parisc/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -450,8 +450,6 @@ static inline void ptep_mkdirty(pte_t *p #define pte_same(A,B) (pte_val(A) == pte_val(B)) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #define io_remap_page_range remap_page_range diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-parisc/rmap.h current/include/asm-parisc/rmap.h --- reference/include/asm-parisc/rmap.h 2002-12-09 18:46:23.000000000 -0800 +++ 
current/include/asm-parisc/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _PARISC_RMAP_H -#define _PARISC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ppc/pgtable.h current/include/asm-ppc/pgtable.h --- reference/include/asm-ppc/pgtable.h 2004-02-18 14:57:18.000000000 -0800 +++ current/include/asm-ppc/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -670,8 +670,6 @@ extern void kernel_set_cachemode (unsign */ #define pgtable_cache_init() do { } while (0) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ppc/rmap.h current/include/asm-ppc/rmap.h --- reference/include/asm-ppc/rmap.h 2002-12-09 18:46:19.000000000 -0800 +++ current/include/asm-ppc/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,9 +0,0 @@ -#ifndef _PPC_RMAP_H -#define _PPC_RMAP_H - -/* PPC calls pte_alloc() before mem_map[] is setup ... */ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ppc64/pgalloc.h current/include/asm-ppc64/pgalloc.h --- reference/include/asm-ppc64/pgalloc.h 2004-02-04 16:24:30.000000000 -0800 +++ current/include/asm-ppc64/pgalloc.h 2004-04-12 10:22:27.000000000 -0700 @@ -48,28 +48,43 @@ pmd_free(pmd_t *pmd) pmd_populate_kernel(mm, pmd, page_address(pte_page)) static inline pte_t * -pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + pte_t *pte; + pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + } + return pte; } static inline struct page * pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = pte_alloc_one_kernel(mm, address); - - if (pte) - return virt_to_page(pte); - + pte_t *pte; + pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT); + if (pte) { + struct page *ptepage = virt_to_page(pte); + ptepage->mapping = (void *) mm; + ptepage->index = address & PMD_MASK; + return ptepage; + } return NULL; } static inline void pte_free_kernel(pte_t *pte) { + virt_to_page(pte)->mapping = NULL; kmem_cache_free(zero_cache, pte); } -#define pte_free(pte_page) pte_free_kernel(page_address(pte_page)) +static inline void pte_free(struct page *ptepage) +{ + ptepage->mapping = NULL; + kmem_cache_free(zero_cache, page_address(ptepage)); +} struct pte_freelist_batch { diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ppc64/pgtable.h current/include/asm-ppc64/pgtable.h --- reference/include/asm-ppc64/pgtable.h 2004-03-11 14:35:23.000000000 -0800 +++ current/include/asm-ppc64/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -488,8 +488,6 @@ extern struct vm_struct * im_get_area(un int region_type); unsigned long im_free(void *addr); -typedef pte_t *pte_addr_t; - long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long va, unsigned long prpn, int secondary, unsigned long hpteflags, diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-ppc64/rmap.h current/include/asm-ppc64/rmap.h --- reference/include/asm-ppc64/rmap.h 2002-12-09 18:46:27.000000000 -0800 +++ current/include/asm-ppc64/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,9 +0,0 @@ -#ifndef _PPC64_RMAP_H -#define _PPC64_RMAP_H - -/* 
PPC64 calls pte_alloc() before mem_map[] is setup ... */ -#define BROKEN_PPC_PTE_ALLOC_ONE - -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-s390/pgtable.h current/include/asm-s390/pgtable.h --- reference/include/asm-s390/pgtable.h 2004-04-07 14:54:34.000000000 -0700 +++ current/include/asm-s390/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -760,8 +760,6 @@ extern inline pte_t mk_swap_pte(unsigned #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -typedef pte_t *pte_addr_t; - #ifndef __s390x__ # define PTE_FILE_MAX_BITS 26 #else /* __s390x__ */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-s390/rmap.h current/include/asm-s390/rmap.h --- reference/include/asm-s390/rmap.h 2002-12-09 18:46:10.000000000 -0800 +++ current/include/asm-s390/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _S390_RMAP_H -#define _S390_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sh/pgalloc.h current/include/asm-sh/pgalloc.h --- reference/include/asm-sh/pgalloc.h 2004-02-04 16:24:31.000000000 -0800 +++ current/include/asm-sh/pgalloc.h 2004-04-12 10:22:25.000000000 -0700 @@ -101,7 +101,7 @@ static inline pte_t ptep_get_and_clear(p unsigned long pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (!page->mapping + if (!page_mapping(page) || list_empty(&page->mapping->i_mmap_shared)) __clear_bit(PG_mapped, &page->flags); } diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sh/pgtable.h current/include/asm-sh/pgtable.h --- reference/include/asm-sh/pgtable.h 2004-04-07 14:54:35.000000000 -0700 +++ current/include/asm-sh/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -274,8 +274,6 @@ extern void update_mmu_cache(struct vm_a #define pte_same(A,B) (pte_val(A) == pte_val(B)) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ #define kern_addr_valid(addr) (1) diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sh/rmap.h current/include/asm-sh/rmap.h --- reference/include/asm-sh/rmap.h 2002-12-09 18:46:22.000000000 -0800 +++ current/include/asm-sh/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _SH_RMAP_H -#define _SH_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sparc/kmap_types.h current/include/asm-sparc/kmap_types.h --- reference/include/asm-sparc/kmap_types.h 2004-01-15 10:41:17.000000000 -0800 +++ current/include/asm-sparc/kmap_types.h 2004-04-12 10:22:27.000000000 -0700 @@ -11,7 +11,6 @@ enum km_type { KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, - KM_PTE2, KM_IRQ0, KM_IRQ1, KM_SOFTIRQ0, diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sparc/pgtable.h current/include/asm-sparc/pgtable.h --- reference/include/asm-sparc/pgtable.h 2004-04-07 14:54:35.000000000 -0700 +++ current/include/asm-sparc/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -491,8 +491,6 @@ extern int io_remap_page_range(struct vm #include -typedef pte_t *pte_addr_t; - #endif /* !(__ASSEMBLY__) */ /* We provide our own get_unmapped_area to cope with VA holes for userland */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sparc/rmap.h current/include/asm-sparc/rmap.h --- reference/include/asm-sparc/rmap.h 2002-12-09 18:46:23.000000000 -0800 +++ current/include/asm-sparc/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _SPARC_RMAP_H -#define 
_SPARC_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sparc64/pgtable.h current/include/asm-sparc64/pgtable.h --- reference/include/asm-sparc64/pgtable.h 2004-01-15 10:41:17.000000000 -0800 +++ current/include/asm-sparc64/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -384,8 +384,6 @@ extern unsigned long get_fb_unmapped_are extern void check_pgt_cache(void); -typedef pte_t *pte_addr_t; - #endif /* !(__ASSEMBLY__) */ #endif /* !(_SPARC64_PGTABLE_H) */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-sparc64/rmap.h current/include/asm-sparc64/rmap.h --- reference/include/asm-sparc64/rmap.h 2002-12-09 18:45:54.000000000 -0800 +++ current/include/asm-sparc64/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _SPARC64_RMAP_H -#define _SPARC64_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-um/pgtable.h current/include/asm-um/pgtable.h --- reference/include/asm-um/pgtable.h 2003-10-14 15:50:34.000000000 -0700 +++ current/include/asm-um/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -384,18 +384,6 @@ static inline pmd_t * pmd_offset(pgd_t * #define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM4G) -typedef u32 pte_addr_t; -#endif - -#if defined(CONFIG_HIGHPTE) && defined(CONFIG_HIGHMEM64G) -typedef u64 pte_addr_t; -#endif - -#if !defined(CONFIG_HIGHPTE) -typedef pte_t *pte_addr_t; -#endif - #define update_mmu_cache(vma,address,pte) do ; while (0) /* Encode and de-code a swap entry */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-um/rmap.h current/include/asm-um/rmap.h --- reference/include/asm-um/rmap.h 2002-12-09 18:46:11.000000000 -0800 +++ current/include/asm-um/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,6 +0,0 @@ -#ifndef __UM_RMAP_H -#define __UM_RMAP_H - -#include "asm/arch/rmap.h" - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-v850/pgtable.h current/include/asm-v850/pgtable.h --- reference/include/asm-v850/pgtable.h 2002-12-09 18:46:13.000000000 -0800 +++ current/include/asm-v850/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -5,8 +5,6 @@ #include -typedef pte_t *pte_addr_t; - #define pgd_present(pgd) (1) /* pages are always present on NO_MM */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-v850/rmap.h current/include/asm-v850/rmap.h --- reference/include/asm-v850/rmap.h 2002-12-09 18:46:17.000000000 -0800 +++ current/include/asm-v850/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1 +0,0 @@ -/* Do not need anything here */ diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-x86_64/pgtable.h current/include/asm-x86_64/pgtable.h --- reference/include/asm-x86_64/pgtable.h 2004-03-11 14:35:28.000000000 -0800 +++ current/include/asm-x86_64/pgtable.h 2004-04-12 10:22:27.000000000 -0700 @@ -390,8 +390,6 @@ extern inline pte_t pte_modify(pte_t pte #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -typedef pte_t *pte_addr_t; - #endif /* !__ASSEMBLY__ */ extern int kern_addr_valid(unsigned long addr); diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-x86_64/rmap.h current/include/asm-x86_64/rmap.h --- reference/include/asm-x86_64/rmap.h 2002-12-09 18:46:16.000000000 -0800 +++ 
current/include/asm-x86_64/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,7 +0,0 @@ -#ifndef _X8664_RMAP_H -#define _X8664_RMAP_H - -/* nothing to see, move along */ -#include - -#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/mm.h current/include/linux/mm.h --- reference/include/linux/mm.h 2004-04-07 14:54:36.000000000 -0700 +++ current/include/linux/mm.h 2004-04-12 10:22:28.000000000 -0700 @@ -150,8 +150,6 @@ struct vm_operations_struct { int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ -struct pte_chain; struct mmu_gather; struct inode; @@ -180,16 +178,12 @@ struct page { page_flags_t flags; /* atomic flags, some possibly updated asynchronously */ atomic_t count; /* Usage count, see below. */ + int mapcount; /* rmap counts ptes mapped in mms */ struct list_head list; /* ->mapping has some page lists. */ struct address_space *mapping; /* The inode (or ...) we belong to. */ unsigned long index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; unsigned long private; /* mapping-private opaque data */ /* @@ -404,14 +398,15 @@ void page_address_init(void); #endif /* - * Return true if this page is mapped into pagetables. Subtle: test pte.direct - * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain - * is only 32-bit. + * On an anonymous page mapped into a user virtual memory area, + * page->mapping points to its anonmm, not to a struct address_space. + * + * Please note that, confusingly, "page_mapping" refers to the inode + * address_space which maps the page from disk; whereas "page_mapped" + * refers to user virtual address space into which the page is mapped. */ -static inline int page_mapped(struct page *page) -{ - return page->pte.direct != 0; -} +#define page_mapping(page) (PageAnon(page)? NULL: (page)->mapping) +#define page_mapped(page) ((page)->mapcount != 0) /* * Error return values for the *_nopage functions @@ -472,6 +467,7 @@ int get_user_pages(struct task_struct *t int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); /* @@ -498,23 +494,6 @@ extern struct shrinker *set_shrinker(int extern void remove_shrinker(struct shrinker *shrinker); /* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. - */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); -} - -/* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. 
@@ -541,6 +520,9 @@ extern void si_meminfo_node(struct sysin extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); +extern struct vm_area_struct *copy_vma(struct vm_area_struct *, + unsigned long addr, unsigned long len, unsigned long pgoff); +extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/page-flags.h current/include/linux/page-flags.h --- reference/include/linux/page-flags.h 2004-04-07 14:54:36.000000000 -0700 +++ current/include/linux/page-flags.h 2004-04-12 10:22:26.000000000 -0700 @@ -69,13 +69,14 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_rmaplock 15 /* Lock bit for reversing to ptes */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ +#define PG_swapcache 16 /* Swap page: swp_entry_t in private */ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page: anonmm in mapping */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -279,12 +280,6 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) @@ -298,15 +293,16 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't use a PG_flag at this time, - * but it may again do so one day. 
- */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else -#define PageSwapCache(page) 0 +#define PageSwapCache(page) 0 #endif struct page; /* forward declaration */ diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/pagemap.h current/include/linux/pagemap.h --- reference/include/linux/pagemap.h 2004-01-15 10:41:19.000000000 -0800 +++ current/include/linux/pagemap.h 2004-04-12 10:22:25.000000000 -0700 @@ -138,17 +138,6 @@ static inline unsigned long get_page_cac return atomic_read(&nr_pagecache); } -static inline void ___add_to_page_cache(struct page *page, - struct address_space *mapping, unsigned long index) -{ - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; - - mapping->nrpages++; - pagecache_acct(1); -} - extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/rmap-locking.h current/include/linux/rmap-locking.h --- reference/include/linux/rmap-locking.h 2004-04-07 14:54:36.000000000 -0700 +++ current/include/linux/rmap-locking.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. - */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, (unsigned long *)&page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, (unsigned long *)&page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/rmap.h current/include/linux/rmap.h --- reference/include/linux/rmap.h 1969-12-31 16:00:00.000000000 -0800 +++ current/include/linux/rmap.h 2004-04-12 10:22:26.000000000 -0700 @@ -0,0 +1,70 @@ +#ifndef _LINUX_RMAP_H +#define _LINUX_RMAP_H +/* + * Declarations for Reverse Mapping functions in mm/rmap.c + * Its structures are declared within that file. + */ + +#include +#include + +#define rmap_lock(page) bit_spin_lock(PG_rmaplock, &(page)->flags) +#define rmap_unlock(page) bit_spin_unlock(PG_rmaplock, &(page)->flags) + +#ifdef CONFIG_MMU + +void fastcall page_add_anon_rmap(struct page *, + struct mm_struct *, unsigned long addr); +void fastcall page_update_anon_rmap(struct page *, + struct mm_struct *, unsigned long addr); +void fastcall page_add_obj_rmap(struct page *); +void fastcall page_remove_rmap(struct page *); + +/** + * page_dup_rmap - duplicate pte mapping to a page + * @page: the page to add the mapping to + * + * For copy_page_range only: minimal extract from page_add_rmap, + * avoiding unnecessary tests (already checked) so it's quicker. 
+ */ +static inline void page_dup_rmap(struct page *page) +{ + rmap_lock(page); + page->mapcount++; + rmap_unlock(page); +} + +/* + * Called from kernel/fork.c to manage anonymous memory + */ +void init_rmap(void); +int exec_rmap(struct mm_struct *); +int dup_rmap(struct mm_struct *, struct mm_struct *oldmm); +void exit_rmap(struct mm_struct *); + +/* + * Called from mm/vmscan.c to handle paging out + */ +int fastcall page_referenced(struct page *); +int fastcall try_to_unmap(struct page *); + +#else /* !CONFIG_MMU */ + +#define init_rmap() do {} while (0) +#define exec_rmap(mm) (0) +#define dup_rmap(mm, oldmm) (0) +#define exit_rmap(mm) do {} while (0) + +#define page_referenced(page) TestClearPageReferenced(page) +#define try_to_unmap(page) SWAP_FAIL + +#endif /* CONFIG_MMU */ + +/* + * Return values of try_to_unmap + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#endif /* _LINUX_RMAP_H */ diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/sched.h current/include/linux/sched.h --- reference/include/linux/sched.h 2004-04-07 14:54:36.000000000 -0700 +++ current/include/linux/sched.h 2004-04-12 10:22:26.000000000 -0700 @@ -201,6 +201,7 @@ struct mm_struct { * together off init_mm.mmlist, and are protected * by mmlist_lock */ + struct anonmm *anonmm; /* For rmap to track anon mem */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/swap.h current/include/linux/swap.h --- reference/include/linux/swap.h 2004-02-04 16:24:33.000000000 -0800 +++ current/include/linux/swap.h 2004-04-12 10:22:28.000000000 -0700 @@ -76,7 +76,6 @@ struct reclaim_state { #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -177,26 +176,11 @@ extern int try_to_free_pages(struct zone extern int shrink_all_memory(int); extern int vm_swappiness; -/* linux/mm/rmap.c */ #ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL #endif /* CONFIG_MMU */ -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 - #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); @@ -230,6 +214,8 @@ extern void swap_free(swp_entry_t); extern void free_swap_and_cache(swp_entry_t); extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); +extern struct swap_info_struct *swap_info_get(swp_entry_t); +extern void swap_info_put(struct swap_info_struct *); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); diff -purN -X /home/mbligh/.diff.exclude reference/init/main.c current/init/main.c --- reference/init/main.c 2004-04-07 14:54:37.000000000 -0700 +++ current/init/main.c 2004-04-12 10:22:26.000000000 -0700 @@ -84,7 +84,6 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); extern void radix_tree_init(void); extern void 
free_initmem(void); extern void populate_rootfs(void); @@ -460,7 +459,6 @@ asmlinkage void __init start_kernel(void calibrate_delay(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); #ifdef CONFIG_X86 if (efi_enabled) efi_enter_virtual_mode(); diff -purN -X /home/mbligh/.diff.exclude reference/kernel/fork.c current/kernel/fork.c --- reference/kernel/fork.c 2004-03-11 14:35:38.000000000 -0800 +++ current/kernel/fork.c 2004-04-12 10:22:28.000000000 -0700 @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -322,7 +323,7 @@ static inline int dup_mmap(struct mm_str /* insert tmp into the share list, just after mpnt */ down(&file->f_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); + list_add(&tmp->shared, &mpnt->shared); up(&file->f_mapping->i_shared_sem); } @@ -417,9 +418,14 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + mm = mm_init(mm); + if (mm && exec_rmap(mm)) { + mm_free_pgd(mm); + free_mm(mm); + mm = NULL; + } } - return NULL; + return mm; } /* @@ -446,6 +452,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + exit_rmap(mm); mmdrop(mm); } } @@ -550,6 +557,12 @@ static int copy_mm(unsigned long clone_f if (!mm_init(mm)) goto fail_nomem; + if (dup_rmap(mm, oldmm)) { + mm_free_pgd(mm); + free_mm(mm); + goto fail_nomem; + } + if (init_new_context(tsk,mm)) goto fail_nocontext; @@ -1246,4 +1259,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN, NULL, NULL); if(!mm_cachep) panic("vma_init: Cannot alloc mm_struct SLAB cache"); + + init_rmap(); } diff -purN -X /home/mbligh/.diff.exclude reference/mm/filemap.c current/mm/filemap.c --- reference/mm/filemap.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/filemap.c 2004-04-12 10:22:25.000000000 -0700 @@ -118,10 +118,12 @@ void remove_from_page_cache(struct page static inline int sync_page(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); + if (PageSwapCache(page)) + blk_run_queues(); return 0; } @@ -235,13 +237,9 @@ EXPORT_SYMBOL(filemap_fdatawait); * This function is used for two things: adding newly allocated pagecache * pages and for moving existing anon pages into swapcache. * - * In the case of pagecache pages, the page is new, so we can just run - * SetPageLocked() against it. The other page state flags were set by - * rmqueue() - * - * In the case of swapcache, try_to_swap_out() has already locked the page, so - * SetPageLocked() is ugly-but-OK there too. The required page state has been - * set up by swap_out_add_to_swap_cache(). + * This function is used to add newly allocated pagecache pages: + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). * * This function does not add the page to the LRU. The caller must do that. 
*/ @@ -256,7 +254,11 @@ int add_to_page_cache(struct page *page, error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); - ___add_to_page_cache(page, mapping, offset); + list_add(&page->list, &mapping->clean_pages); + page->mapping = mapping; + page->index = offset; + mapping->nrpages++; + pagecache_acct(1); } else { page_cache_release(page); } diff -purN -X /home/mbligh/.diff.exclude reference/mm/fremap.c current/mm/fremap.c --- reference/mm/fremap.c 2004-03-11 14:35:39.000000000 -0800 +++ current/mm/fremap.c 2004-04-12 10:22:28.000000000 -0700 @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -36,7 +36,7 @@ static inline void zap_pte(struct mm_str if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -49,7 +49,7 @@ static inline void zap_pte(struct mm_str } /* - * Install a page to a given virtual memory address, release any + * Install a file page to a given virtual memory address, release any * previously existing mapping. */ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -60,11 +60,13 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; pte_t pte_val; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; + /* + * We use page_add_obj_rmap below: if install_page is + * ever extended to anonymous pages, this will warn us. + */ + BUG_ON(!page_mapping(page)); + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -81,18 +83,14 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + page_add_obj_rmap(page); pte_val = *pte; pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -err: return err; } EXPORT_SYMBOL(install_page); @@ -188,9 +186,12 @@ asmlinkage long sys_remap_file_pages(uns /* * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within - * the single existing vma: + * the single existing vma. vm_private_data is used as a + * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED + * or VM_LOCKED, but VM_LOCKED could be revoked later on). 
*/ if (vma && (vma->vm_flags & VM_SHARED) && + (!vma->vm_private_data || (vma->vm_flags & VM_RESERVED)) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { diff -purN -X /home/mbligh/.diff.exclude reference/mm/memory.c current/mm/memory.c --- reference/mm/memory.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/memory.c 2004-04-12 10:22:28.000000000 -0700 @@ -43,12 +43,11 @@ #include #include #include -#include +#include #include #include #include -#include #include #include #include @@ -105,7 +104,7 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -164,7 +163,7 @@ pte_t fastcall * pte_alloc_map(struct mm pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); + inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } out: @@ -190,7 +189,6 @@ pte_t fastcall * pte_alloc_kernel(struct pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); pmd_populate_kernel(mm, pmd, new); } out: @@ -217,20 +215,10 @@ int copy_page_range(struct mm_struct *ds unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -329,32 +317,8 @@ skip_copy_pte_range: pte = pte_mkold(pte); get_page(page); dst->rss++; - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (pte_chain) - goto cont_copy_pte_range_noset; - - /* - * pte_chain allocation failed, and we need to - * run page reclaim. 
- */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + page_dup_rmap(page); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { @@ -377,10 +341,8 @@ cont_copy_pmd_range: out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; nomem: - pte_chain_free(pte_chain); return -ENOMEM; } @@ -417,11 +379,11 @@ zap_pte_range(struct mmu_gather *tlb, pm if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && - !PageSwapCache(page)) + if (pte_young(pte) && + page_mapping(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } @@ -1014,7 +976,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain; pte_t entry; if (unlikely(!pfn_valid(pfn))) { @@ -1039,6 +1000,14 @@ static int do_wp_page(struct mm_struct * entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma); ptep_establish(vma, address, page_table, entry); + if (PageAnon(old_page)) { + /* + * Optimization: the page may have been + * registered under a long defunct mm: + * now we know it belongs only to this. + */ + page_update_anon_rmap(old_page, mm, address); + } update_mmu_cache(vma, address, entry); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1053,9 +1022,6 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_pte_chain; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_new_page; @@ -1069,10 +1035,11 @@ static int do_wp_page(struct mm_struct * if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + else + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, mm, address); /* Free the old page.. */ new_page = old_page; @@ -1081,12 +1048,9 @@ static int do_wp_page(struct mm_struct * page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return VM_FAULT_MINOR; no_new_page: - pte_chain_free(pte_chain); -no_pte_chain: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1244,7 +1208,6 @@ static int do_swap_page(struct mm_struct swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1274,11 +1237,6 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = VM_FAULT_OOM; - goto out; - } lock_page(page); /* @@ -1298,11 +1256,13 @@ static int do_swap_page(struct mm_struct /* The page isn't present yet, go ahead with the fault. 
*/ + mm->rss++; + page_add_anon_rmap(page, mm, address); + swap_free(entry); if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1310,14 +1270,12 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; } @@ -1333,20 +1291,7 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; - int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC | __GFP_NOWARN); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1368,7 +1313,6 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; goto out; } mm->rss++; @@ -1377,24 +1321,19 @@ do_anonymous_page(struct mm_struct *mm, vma); lru_cache_add_active(page); mark_page_accessed(page); + page_add_anon_rmap(page, mm, addr); } set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); - ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); - return ret; + return VM_FAULT_MINOR; +no_mem: + return VM_FAULT_OOM; } /* @@ -1416,9 +1355,9 @@ do_no_page(struct mm_struct *mm, struct struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; int sequence = 0; int ret = VM_FAULT_MINOR; + int anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, @@ -1440,10 +1379,6 @@ retry: if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; - /* * Should we do an early C-O-W break? */ @@ -1453,8 +1388,8 @@ retry: goto oom; copy_user_highpage(page, new_page, address); page_cache_release(new_page); - lru_cache_add_active(page); new_page = page; + anon = 1; } spin_lock(&mm->page_table_lock); @@ -1468,7 +1403,6 @@ retry: sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1492,7 +1426,11 @@ retry: if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + if (anon) { + lru_cache_add_active(new_page); + page_add_anon_rmap(new_page, mm, address); + } else + page_add_obj_rmap(new_page); pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. 
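
For readers following the new do_no_page tail above: a page that had to be copied early for COW becomes anonymous and is keyed by the faulting mm's anonmm and user address, while the pagecache page itself stays keyed by its address_space and file index. The userspace sketch below is only a model of that keying; struct model_page, keyed_by and the string labels are invented for illustration.

#include <stdio.h>

/*
 * Model of the rmap choice at the end of do_no_page: an early-COW copy
 * becomes anonymous and is keyed by (anonmm, user address); the pagecache
 * page stays keyed by (address_space, file index).  Hypothetical types.
 */
struct model_page {
        const char *keyed_by;   /* what page->mapping would point at */
        unsigned long index;    /* file offset, or user address if anon */
        int mapcount;
};

static void add_anon_rmap(struct model_page *p, const char *anonmm,
                          unsigned long address)
{
        if (!p->mapcount) {             /* first mapping: SetPageAnon */
                p->keyed_by = anonmm;
                p->index = address;
        }
        p->mapcount++;
}

static void add_obj_rmap(struct model_page *p)
{
        p->mapcount++;          /* mapping and index already set by pagecache */
}

int main(void)
{
        struct model_page file_page = { "address_space of the file", 7, 0 };
        struct model_page cow_copy  = { NULL, 0, 0 };
        int write_to_private_vma = 1;   /* the "anon = 1" case above */

        if (write_to_private_vma)
                add_anon_rmap(&cow_copy, "anonmm of this mm", 0x40007000);
        else
                add_obj_rmap(&file_page);

        printf("cow copy keyed by \"%s\", index %#lx\n",
               cow_copy.keyed_by, cow_copy.index);
        return 0;
}
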
*/ @@ -1510,7 +1448,6 @@ oom: page_cache_release(new_page); ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; } diff -purN -X /home/mbligh/.diff.exclude reference/mm/mmap.c current/mm/mmap.c --- reference/mm/mmap.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/mmap.c 2004-04-12 10:22:28.000000000 -0700 @@ -333,8 +333,6 @@ static inline int is_mergeable_vma(struc return 0; if (vma->vm_flags != vm_flags) return 0; - if (vma->vm_private_data) - return 0; return 1; } @@ -385,7 +383,8 @@ can_vma_merge_after(struct vm_area_struc * whether that can be merged with its predecessor or its successor. Or * both (it neatly fills a hole). */ -static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, +static struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, struct rb_node *rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) @@ -399,7 +398,7 @@ static int vma_merge(struct mm_struct *m * vma->vm_flags & VM_SPECIAL, too. */ if (vm_flags & VM_SPECIAL) - return 0; + return NULL; i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL; @@ -412,7 +411,6 @@ static int vma_merge(struct mm_struct *m * Can it merge with the predecessor? */ if (prev->vm_end == addr && - is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; int need_up = 0; @@ -443,12 +441,12 @@ static int vma_merge(struct mm_struct *m mm->map_count--; kmem_cache_free(vm_area_cachep, next); - return 1; + return prev; } spin_unlock(lock); if (need_up) up(i_shared_sem); - return 1; + return prev; } /* @@ -459,7 +457,7 @@ static int vma_merge(struct mm_struct *m merge_next: if (!can_vma_merge_before(prev, vm_flags, file, pgoff, (end - addr) >> PAGE_SHIFT)) - return 0; + return NULL; if (end == prev->vm_start) { if (file) down(i_shared_sem); @@ -469,11 +467,11 @@ static int vma_merge(struct mm_struct *m spin_unlock(lock); if (file) up(i_shared_sem); - return 1; + return prev; } } - return 0; + return NULL; } /* @@ -1492,5 +1490,57 @@ void insert_vm_struct(struct mm_struct * if (__vma && __vma->vm_start < vma->vm_end) BUG(); vma_link(mm, vma, prev, rb_link, rb_parent); - validate_mm(mm); +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long pgoff) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma, *prev; + struct rb_node **rb_link, *rb_parent; + + find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len, + vma->vm_flags, vma->vm_file, pgoff); + if (!new_vma) { + new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (new_vma) { + *new_vma = *vma; + INIT_LIST_HEAD(&new_vma->shared); + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + } + } + return new_vma; +} + +/* + * Position vma after prev in shared file list: + * for mremap move error recovery racing against vmtruncate. 
+ */ +void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev) +{ + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping; + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + if (mapping) { + down(&mapping->i_shared_sem); + spin_lock(&mm->page_table_lock); + list_move(&vma->shared, &prev->shared); + spin_unlock(&mm->page_table_lock); + up(&mapping->i_shared_sem); + } + } } diff -purN -X /home/mbligh/.diff.exclude reference/mm/mremap.c current/mm/mremap.c --- reference/mm/mremap.c 2004-02-18 14:57:24.000000000 -0800 +++ current/mm/mremap.c 2004-04-12 10:22:28.000000000 -0700 @@ -15,7 +15,9 @@ #include #include #include -#include +#include +#include +#include #include #include @@ -79,32 +81,102 @@ static inline pte_t *alloc_one_pte_map(s return pte; } -static int -copy_one_pte(struct vm_area_struct *vma, unsigned long old_addr, - pte_t *src, pte_t *dst, struct pte_chain **pte_chainp) +#ifdef CONFIG_SWAP +/* + * rmap_needs_broken_cow is for mremap MAYMOVE's move_one_page. + * The anonmm objrmap can only track anon page movements if the + * page (or swap entry) is exclusive to the mm, but we don't + * want the waste of early COW break unless it's necessary. + * This tells us, with side-effect to update anon rmap if okay. + * page_table_lock (and mmap_sem) are held throughout. + */ +static int rmap_needs_broken_cow(pte_t *ptep, unsigned long new_addr) { - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); + pte_t pte = *ptep; + unsigned long pfn; + struct page *page; + swp_entry_t entry; + struct swap_info_struct *si; + unsigned int mapcount = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + return 0; + page = pfn_to_page(pfn); + if (!PageAnon(page)) + return 0; + if (pte_write(pte)) + goto update; +again: + /* + * page->private on a PageAnon page is always the + * swap entry (if PageSwapCache) or 0 (if not): + * so we can peep at page->private without taking + * a lock, no need to check PageSwapCache too. + */ + entry.val = page->private; + smp_rmb(); + mapcount = page->mapcount; + if (mapcount > 1) + return 1; + if (!entry.val) + goto update; + /* + * This is tricky: entry can get freed right here, + * since we don't hold the page lock (and cannot wait + * for it). Use swap_duplicate which, already allows + * for that, before the less forgiving swap_info_get. + */ + if (!swap_duplicate(entry)) + goto again; + si = swap_info_get(entry); + if (si) { + mapcount = si->swap_map[swp_offset(entry)] + + page->mapcount - 2; + swap_info_put(si); + } else + mapcount = 0; + swap_free(entry); + if (entry.val != page->private) + goto again; + if (mapcount > 1) + return 1; +update: + /* Before we forget the struct page, update its rmap */ + page_update_anon_rmap(page, current->mm, new_addr); + return 0; + } - if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); - pte = ptep_clear_flush(vma, old_addr, src); - if (!dst) { - /* No dest? We must put it back. 
*/ - dst = src; - error++; + if (!pte_file(pte) && !pte_none(pte)) { + entry = pte_to_swp_entry(pte); + si = swap_info_get(entry); + if (si) { + page = NULL; + mapcount = si->swap_map[swp_offset(entry)]; + if (mapcount == 2) { + page = lookup_swap_cache(entry); + if (page) + mapcount = page->mapcount + 1; + } + swap_info_put(si); + if (page) + page_cache_release(page); } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } - return error; + + return mapcount > 1; } +#else /* !CONFIG_SWAP */ + +/* + * The swap interfaces used above are not available. Actually, + * all of the anonymous rmap is just a waste of space-time in this case. + * But no enthusiam for peppering the code with #ifdefs right now. + */ +#define rmap_needs_broken_cow(ptep, new_addr) 0 + +#endif /* CONFIG_SWAP */ static int move_one_page(struct vm_area_struct *vma, unsigned long old_addr, @@ -113,13 +185,7 @@ move_one_page(struct vm_area_struct *vma struct mm_struct *mm = vma->vm_mm; int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - error = -ENOMEM; - goto out; - } spin_lock(&mm->page_table_lock); src = get_one_pte_map_nested(mm, old_addr); if (src) { @@ -140,22 +206,28 @@ move_one_page(struct vm_area_struct *vma * page_table_lock, we should re-check the src entry... */ if (src) { - error = copy_one_pte(vma, old_addr, src, - dst, &pte_chain); + if (!dst) + error = -ENOMEM; + else if (rmap_needs_broken_cow(src, new_addr)) + error = -EAGAIN; + else { + pte_t pte; + pte = ptep_clear_flush(vma, old_addr, src); + set_pte(dst, pte); + } pte_unmap_nested(src); } pte_unmap(dst); } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); -out: return error; } static int move_page_tables(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_addr, unsigned long len) { - unsigned long offset = len; + unsigned long offset = 0; + int ret; flush_cache_range(vma, old_addr, old_addr + len); @@ -164,137 +236,107 @@ static int move_page_tables(struct vm_ar * easy way out on the assumption that most remappings will be * only a few pages.. This also makes error recovery easier. */ - while (offset) { - offset -= PAGE_SIZE; - if (move_one_page(vma, old_addr + offset, new_addr + offset)) - goto oops_we_failed; + while (offset < len) { + ret = move_one_page(vma, old_addr+offset, new_addr+offset); + if (!ret) { + offset += PAGE_SIZE; + continue; + } + if (ret != -EAGAIN) + break; + /* + * The anonmm objrmap can only track anon page movements + * if the page (or swap entry) is exclusive to this mm. + * In the very unusual case when it's shared, break COW + * (take a copy of the page) to make it exclusive. If + * the page is shared and on swap, move_one_page will + * normally succeed on the third attempt (do_swap_page + * does not break COW); but under very great pressure it + * could get swapped out again and need more attempts. + */ + ret = handle_mm_fault(vma->vm_mm, vma, old_addr+offset, 1); + if (ret != VM_FAULT_MINOR && ret != VM_FAULT_MAJOR) + break; } - return 0; - - /* - * Ok, the move failed because we didn't have enough pages for - * the new page table tree. This is unlikely, but we have to - * take the possibility into account. 
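
As I read rmap_needs_broken_cow above, the exclusivity test adds the swap-entry users to the pte mappers and subtracts two: one for the reference the swap cache itself holds in swap_map, and one for the temporary reference just taken by swap_duplicate. A minimal userspace check of that arithmetic, with needs_broken_cow a hypothetical stand-in rather than the kernel function:

#include <assert.h>
#include <stdio.h>

/*
 * Model of the exclusivity arithmetic: swap_map counts the swap cache's
 * own reference, one reference per swapped-out pte, and the temporary
 * swap_duplicate() taken just before peeking; mapcount counts present
 * ptes.  The page is exclusive to one mm iff the real users total 1.
 */
static int needs_broken_cow(int swap_map, int mapcount)
{
        int users = swap_map + mapcount - 2;    /* drop cache ref + our dup */

        return users > 1;
}

int main(void)
{
        /* Only our own present pte; swap_map = cache ref + our duplicate. */
        assert(!needs_broken_cow(2, 1));

        /* Our present pte plus another mm's swapped-out copy of the entry. */
        assert(needs_broken_cow(3, 1));

        /* Two mms both have the page mapped in right now. */
        assert(needs_broken_cow(2, 2));

        printf("exclusivity checks behave as expected\n");
        return 0;
}
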
In that case we just move - * all the pages back (this will work, because we still have - * the old page tables) - */ -oops_we_failed: - flush_cache_range(vma, new_addr, new_addr + len); - while ((offset += PAGE_SIZE) < len) - move_one_page(vma, new_addr + offset, old_addr + offset); - zap_page_range(vma, new_addr, len); - return -1; + return offset; } static unsigned long move_vma(struct vm_area_struct *vma, - unsigned long addr, unsigned long old_len, unsigned long new_len, - unsigned long new_addr) + unsigned long old_addr, unsigned long old_len, + unsigned long new_len, unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *next, *prev; - int allocated_vma; + struct vm_area_struct *new_vma; + unsigned long vm_flags = vma->vm_flags; + unsigned long new_pgoff; + unsigned long moved_len; + unsigned long excess = 0; int split = 0; - new_vma = NULL; - next = find_vma_prev(mm, new_addr, &prev); - if (next) { - if (prev && prev->vm_end == new_addr && - can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && - !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = new_addr + new_len; - spin_unlock(&mm->page_table_lock); - new_vma = prev; - if (next != prev->vm_next) - BUG(); - if (prev->vm_end == next->vm_start && - can_vma_merge(next, prev->vm_flags)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = next->vm_end; - __vma_unlink(mm, next, prev); - spin_unlock(&mm->page_table_lock); - if (vma == next) - vma = prev; - mm->map_count--; - kmem_cache_free(vm_area_cachep, next); - } - } else if (next->vm_start == new_addr + new_len && - can_vma_merge(next, vma->vm_flags) && - !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - next->vm_start = new_addr; - spin_unlock(&mm->page_table_lock); - new_vma = next; - } - } else { - prev = find_vma(mm, new_addr-1); - if (prev && prev->vm_end == new_addr && - can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && - !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = new_addr + new_len; - spin_unlock(&mm->page_table_lock); - new_vma = prev; - } - } + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. 
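
A rough userspace model of the retry protocol driven by move_page_tables above: move_one_page either moves a pte, returns -EAGAIN when the anon page is not exclusive to this mm (so the caller write-faults it to break COW and tries again), or fails outright, and the loop reports how far it got so move_vma can fall back. All names below (move_one, break_cow, move_pages) are hypothetical:

#include <errno.h>
#include <stdio.h>

#define NPAGES  4

static int shared[NPAGES] = { 0, 1, 0, 0 };     /* page 1 needs a COW break */

/* Hypothetical stand-in for move_one_page: refuse to move a shared page. */
static int move_one(int i)
{
        return shared[i] ? -EAGAIN : 0;
}

/* Hypothetical stand-in for write-faulting the old address to break COW. */
static void break_cow(int i)
{
        shared[i] = 0;          /* the copy is now exclusive to this mm */
}

/* Returns how many pages were moved; a short count means "move them back". */
static int move_pages(void)
{
        int moved = 0;

        while (moved < NPAGES) {
                int ret = move_one(moved);

                if (!ret) {
                        moved++;
                        continue;
                }
                if (ret != -EAGAIN)
                        break;          /* hard failure: stop and report */
                break_cow(moved);       /* then retry the same page */
        }
        return moved;
}

int main(void)
{
        printf("moved %d of %d pages\n", move_pages(), NPAGES);
        return 0;
}
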
+ */ + if (mm->map_count >= sysctl_max_map_count - 3) + return -ENOMEM; - allocated_vma = 0; - if (!new_vma) { - new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!new_vma) - goto out; - allocated_vma = 1; - } - - if (!move_page_tables(vma, new_addr, addr, old_len)) { - unsigned long vm_locked = vma->vm_flags & VM_LOCKED; - - if (allocated_vma) { - *new_vma = *vma; - INIT_LIST_HEAD(&new_vma->shared); - new_vma->vm_start = new_addr; - new_vma->vm_end = new_addr+new_len; - new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - insert_vm_struct(current->mm, new_vma); - } + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(vma, new_addr, new_len, new_pgoff); + if (!new_vma) + return -ENOMEM; - /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vma->vm_flags & VM_ACCOUNT) { - vma->vm_flags &= ~VM_ACCOUNT; - if (addr > vma->vm_start) { - if (addr + old_len < vma->vm_end) - split = 1; - } else if (addr + old_len == vma->vm_end) - vma = NULL; /* it will be removed */ - } else - vma = NULL; /* nothing more to do */ + moved_len = move_page_tables(vma, new_addr, old_addr, old_len); + if (moved_len < old_len) { + /* + * On error, move entries back from new area to old, + * which will succeed since page tables still there, + * and then proceed to unmap new area instead of old. + * + * Subtle point from Rajesh Venkatasubramanian: before + * moving file-based ptes, move new_vma before old vma + * in the i_mmap or i_mmap_shared list, so when racing + * against vmtruncate we cannot propagate pages to be + * truncated back from new_vma into just cleaned old. + */ + vma_relink_file(vma, new_vma); + move_page_tables(new_vma, old_addr, new_addr, moved_len); + vma = new_vma; + old_len = new_len; + old_addr = new_addr; + new_addr = -ENOMEM; + } - do_munmap(current->mm, addr, old_len); + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vm_flags & VM_ACCOUNT) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (old_addr > vma->vm_start && + old_addr + old_len < vma->vm_end) + split = 1; + } - /* Restore VM_ACCOUNT if one or two pieces of vma left */ - if (vma) { - vma->vm_flags |= VM_ACCOUNT; - if (split) - vma->vm_next->vm_flags |= VM_ACCOUNT; - } + if (do_munmap(mm, old_addr, old_len) < 0) { + /* OOM: unable to split vma, just get accounts right */ + vm_unacct_memory(excess >> PAGE_SHIFT); + excess = 0; + } - current->mm->total_vm += new_len >> PAGE_SHIFT; - if (vm_locked) { - current->mm->locked_vm += new_len >> PAGE_SHIFT; - if (new_len > old_len) - make_pages_present(new_addr + old_len, - new_addr + new_len); - } - return new_addr; + /* Restore VM_ACCOUNT if one or two pieces of vma left */ + if (excess) { + vma->vm_flags |= VM_ACCOUNT; + if (split) + vma->vm_next->vm_flags |= VM_ACCOUNT; + } + + mm->total_vm += new_len >> PAGE_SHIFT; + if (vm_flags & VM_LOCKED) { + mm->locked_vm += new_len >> PAGE_SHIFT; + if (new_len > old_len) + make_pages_present(new_addr + old_len, + new_addr + new_len); } - if (allocated_vma) - kmem_cache_free(vm_area_cachep, new_vma); - out: - return -ENOMEM; + + return new_addr; } /* @@ -438,6 +480,7 @@ unsigned long do_mremap(unsigned long ad if (flags & MREMAP_MAYMOVE) { if (!(flags & MREMAP_FIXED)) { unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) map_flags |= MAP_SHARED; diff -purN -X 
/home/mbligh/.diff.exclude reference/mm/nommu.c current/mm/nommu.c --- reference/mm/nommu.c 2004-02-04 16:24:35.000000000 -0800 +++ current/mm/nommu.c 2004-04-12 10:22:26.000000000 -0700 @@ -567,7 +567,3 @@ unsigned long get_unmapped_area(struct f { return -ENOMEM; } - -void pte_chain_init(void) -{ -} diff -purN -X /home/mbligh/.diff.exclude reference/mm/page-writeback.c current/mm/page-writeback.c --- reference/mm/page-writeback.c 2004-02-04 16:24:35.000000000 -0800 +++ current/mm/page-writeback.c 2004-04-12 10:22:26.000000000 -0700 @@ -532,6 +532,24 @@ int __set_page_dirty_nobuffers(struct pa EXPORT_SYMBOL(__set_page_dirty_nobuffers); /* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int (*spd)(struct page *); + + if (!mapping) { + SetPageDirty(page); + return 0; + } + spd = mapping->a_ops->set_page_dirty; + return spd? (*spd)(page): __set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); + +/* * set_page_dirty() is racy if the caller has no reference against * page->mapping->host, and if the page is unlocked. This is because another * CPU could truncate the page off the mapping and then free the mapping. @@ -559,7 +577,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); int test_clear_page_dirty(struct page *page) { if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && !mapping->backing_dev_info->memory_backed) dec_page_state(nr_dirty); diff -purN -X /home/mbligh/.diff.exclude reference/mm/page_alloc.c current/mm/page_alloc.c --- reference/mm/page_alloc.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/page_alloc.c 2004-04-12 10:22:26.000000000 -0700 @@ -83,6 +83,9 @@ static void bad_page(const char *functio 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); page->mapping = NULL; @@ -220,6 +223,9 @@ static inline void free_pages_check(cons 1 << PG_active | 1 << PG_reclaim | 1 << PG_slab | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(function, page); if (PageDirty(page)) @@ -327,6 +333,9 @@ static void prep_new_page(struct page *p 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); diff -purN -X /home/mbligh/.diff.exclude reference/mm/page_io.c current/mm/page_io.c --- reference/mm/page_io.c 2002-12-17 11:36:36.000000000 -0800 +++ current/mm/page_io.c 2004-04-12 10:22:26.000000000 -0700 @@ -16,8 +16,6 @@ #include #include #include -#include /* for block_sync_page() */ -#include #include #include @@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->private; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -130,13 +128,6 @@ out: return ret; } -struct address_space_operations swap_aops = { - .writepage = swap_writepage, - .readpage = swap_readpage, - .sync_page = block_sync_page, - .set_page_dirty = __set_page_dirty_nobuffers, -}; - /* * A scruffy utility function to read or write an arbitrary swap page * and wait on the I/O. 
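
With this patch a swap-cache page no longer carries swapper_space in page->mapping; PageSwapCache plus the swap entry packed into page->private give its swap location, which is what the get_swap_bio change above reads back. The packing below is only an assumed layout (offset in the low 24 bits, type above); the real encoding lives in swapops.h and varies by architecture:

#include <assert.h>
#include <stdio.h>

/* Assumed layout only: the real packing is defined in swapops.h per arch. */
#define MODEL_TYPE_SHIFT        24      /* offset kept in the low 24 bits */

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
        swp_entry_t e = { (type << MODEL_TYPE_SHIFT) | offset };
        return e;
}

static unsigned long swp_type(swp_entry_t e)
{
        return e.val >> MODEL_TYPE_SHIFT;
}

static unsigned long swp_offset(swp_entry_t e)
{
        return e.val & ((1UL << MODEL_TYPE_SHIFT) - 1);
}

int main(void)
{
        unsigned long page_private;     /* stands in for page->private */
        swp_entry_t e = swp_entry(2, 12345);

        page_private = e.val;           /* what the swap cache would store */

        e.val = page_private;           /* what get_swap_bio reads back */
        assert(swp_type(e) == 2 && swp_offset(e) == 12345);
        printf("swap type %lu, offset %lu\n", swp_type(e), swp_offset(e));
        return 0;
}
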
@@ -149,10 +140,8 @@ int rw_swap_page_sync(int rw, swp_entry_ }; lock_page(page); - - BUG_ON(page->mapping); - page->mapping = &swapper_space; - page->index = entry.val; + SetPageSwapCache(page); + page->private = entry.val; if (rw == READ) { ret = swap_readpage(NULL, page); @@ -161,7 +150,7 @@ int rw_swap_page_sync(int rw, swp_entry_ ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } - page->mapping = NULL; + ClearPageSwapCache(page); if (ret == 0 && (!PageUptodate(page) || PageError(page))) ret = -EIO; return ret; diff -purN -X /home/mbligh/.diff.exclude reference/mm/rmap.c current/mm/rmap.c --- reference/mm/rmap.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/rmap.c 2004-04-12 10:22:28.000000000 -0700 @@ -4,17 +4,14 @@ * Copyright 2001, Rik van Riel * Released under the General Public License (GPL). * - * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. */ /* * Locking: - * - the page->pte.chain is protected by the PG_chainlock bit, - * which nests within the the mm->page_table_lock, + * - the page->mapcount field is protected by the PG_rmaplock bit, + * which nests within the mm->page_table_lock, * which nests within the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks @@ -26,96 +23,306 @@ #include #include #include -#include -#include -#include - -#include -#include -#include -#include +#include -/* #define DEBUG_RMAP */ +#include /* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. + * struct anonmm: to track a bundle of anonymous memory mappings. * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. - */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) + * Could be embedded in mm_struct, but mm_struct is rather heavyweight, + * and we may need the anonmm to stay around long after the mm_struct + * and its pgd have been freed: because pages originally faulted into + * that mm have been duped into forked mms, and still need tracking. + */ +struct anonmm { + atomic_t count; /* ref count, incl. 1 per page */ + spinlock_t lock; /* head's locks list; others unused */ + struct mm_struct *mm; /* assoc mm_struct, NULL when gone */ + struct anonmm *head; /* exec starts new chain from head */ + struct list_head list; /* chain of associated anonmms */ +}; +static kmem_cache_t *anonmm_cachep; -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the lowest-index used pte in ptes[] (which is equal also - * to the offset of the highest-index unused pte in ptes[], plus one). - */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; - -kmem_cache_t *pte_chain_cache; +/** + ** Functions for creating and destroying struct anonmm. 
+ **/ -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) +void __init init_rmap(void) { - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); + anonmm_cachep = kmem_cache_create("anonmm", + sizeof(struct anonmm), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!anonmm_cachep) + panic("init_rmap: Cannot alloc anonmm SLAB cache"); } -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) +int exec_rmap(struct mm_struct *mm) { - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); + struct anonmm *anonmm; + + anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL); + if (unlikely(!anonmm)) + return -ENOMEM; + + atomic_set(&anonmm->count, 2); /* ref by mm and head */ + anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is used */ + anonmm->mm = mm; + anonmm->head = anonmm; + INIT_LIST_HEAD(&anonmm->list); + mm->anonmm = anonmm; + return 0; } -static inline int pte_chain_idx(struct pte_chain *pte_chain) +int dup_rmap(struct mm_struct *mm, struct mm_struct *oldmm) { - return pte_chain->next_and_idx & NRPTE; + struct anonmm *anonmm; + struct anonmm *anonhd = oldmm->anonmm->head; + + anonmm = kmem_cache_alloc(anonmm_cachep, SLAB_KERNEL); + if (unlikely(!anonmm)) + return -ENOMEM; + + /* + * copy_mm calls us before dup_mmap has reset the mm fields, + * so reset rss ourselves before adding to anonhd's list, + * to keep away from this mm until it's worth examining. + */ + mm->rss = 0; + + atomic_set(&anonmm->count, 1); /* ref by mm */ + anonmm->lock = SPIN_LOCK_UNLOCKED; /* this lock is not used */ + anonmm->mm = mm; + anonmm->head = anonhd; + spin_lock(&anonhd->lock); + atomic_inc(&anonhd->count); /* ref by anonmm's head */ + list_add_tail(&anonmm->list, &anonhd->list); + spin_unlock(&anonhd->lock); + mm->anonmm = anonmm; + return 0; +} + +void exit_rmap(struct mm_struct *mm) +{ + struct anonmm *anonmm = mm->anonmm; + struct anonmm *anonhd = anonmm->head; + + mm->anonmm = NULL; + spin_lock(&anonhd->lock); + anonmm->mm = NULL; + if (atomic_dec_and_test(&anonmm->count)) { + BUG_ON(anonmm == anonhd); + list_del(&anonmm->list); + kmem_cache_free(anonmm_cachep, anonmm); + if (atomic_dec_and_test(&anonhd->count)) + BUG(); + } + spin_unlock(&anonhd->lock); + if (atomic_read(&anonhd->count) == 1) { + BUG_ON(anonhd->mm); + BUG_ON(!list_empty(&anonhd->list)); + kmem_cache_free(anonmm_cachep, anonhd); + } } -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) +static void free_anonmm(struct anonmm *anonmm) +{ + struct anonmm *anonhd = anonmm->head; + + BUG_ON(anonmm->mm); + BUG_ON(anonmm == anonhd); + spin_lock(&anonhd->lock); + list_del(&anonmm->list); + if (atomic_dec_and_test(&anonhd->count)) + BUG(); + spin_unlock(&anonhd->lock); + kmem_cache_free(anonmm_cachep, anonmm); +} + +static inline void clear_page_anon(struct page *page) { - return (unsigned long)pte_chain | idx; + struct anonmm *anonmm = (struct anonmm *) page->mapping; + + page->mapping = NULL; + ClearPageAnon(page); + if (atomic_dec_and_test(&anonmm->count)) + free_anonmm(anonmm); } +/** + ** VM stuff below this comment + **/ + /* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. 
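
To make the reference counting in exec_rmap, dup_rmap and exit_rmap above easier to follow, here is a stripped userspace model: exec creates a chain head holding two references (one for the mm, one for being head), each fork adds a member holding one reference and pins the head, and the head may outlive its own mm until the last member goes. Plain malloc/free stands in for the slab, there is no locking, and page references (the put done by clear_page_anon) are left out of this run; all names are hypothetical:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical userspace model of struct anonmm reference counting. */
struct model_anonmm {
        int count;                      /* mm ref + head ref + member refs */
        int has_mm;                     /* the mm_struct is still alive */
        int members;                    /* stand-in for the list of forks */
        struct model_anonmm *head;      /* chain head created at exec time */
};

static struct model_anonmm *model_exec(void)
{
        struct model_anonmm *hd = calloc(1, sizeof(*hd));

        hd->count = 2;                  /* one ref for the mm, one for head */
        hd->has_mm = 1;
        hd->head = hd;
        return hd;
}

static struct model_anonmm *model_dup(struct model_anonmm *parent)
{
        struct model_anonmm *a = calloc(1, sizeof(*a));

        a->count = 1;                   /* only the mm refs a non-head member */
        a->has_mm = 1;
        a->head = parent->head;
        a->head->count++;               /* each member pins the head */
        a->head->members++;
        return a;
}

static void model_exit(struct model_anonmm *a)
{
        struct model_anonmm *hd = a->head;

        a->has_mm = 0;
        if (--a->count == 0) {          /* no pages left naming this anonmm */
                assert(a != hd);
                hd->members--;
                hd->count--;
                free(a);
        }
        if (hd->count == 1) {           /* only the head's self-ref remains */
                assert(!hd->has_mm && !hd->members);
                free(hd);
        }
}

int main(void)
{
        struct model_anonmm *head = model_exec();
        struct model_anonmm *child = model_dup(head);

        model_exit(head);               /* parent exits first: head lingers */
        model_exit(child);              /* last member gone: head freed too */
        printf("anonmm chain torn down\n");
        return 0;
}
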
- * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. + * At what user virtual address is page expected in file-backed vma? */ +#define NOADDR (~0UL) /* impossible user virtual address */ +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long pgoff; + unsigned long address; + + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + return (address >= vma->vm_start && address < vma->vm_end)? + address: NOADDR; +} /** - ** VM stuff below this comment + ** Subfunctions of page_referenced: page_referenced_one called + ** repeatedly from either page_referenced_anon or page_referenced_obj. **/ +static int page_referenced_one(struct page *page, + struct mm_struct *mm, unsigned long address, int *mapcount) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 0; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (ptep_test_and_clear_young(pte)) + referenced++; + + (*mapcount)--; + +out_unmap: + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return referenced; +} + +static inline int page_referenced_anon(struct page *page, int *mapcount) +{ + struct anonmm *anonmm = (struct anonmm *) page->mapping; + struct anonmm *anonhd = anonmm->head; + struct list_head *seek_head; + int referenced = 0; + + spin_lock(&anonhd->lock); + /* + * First try the indicated mm, it's the most likely. + */ + if (anonmm->mm && anonmm->mm->rss) { + referenced += page_referenced_one( + page, anonmm->mm, page->index, mapcount); + if (!*mapcount) + goto out; + } + + /* + * Then down the rest of the list, from that as the head. Stop + * when we reach anonhd? No: although a page cannot get dup'ed + * into an older mm, once swapped, its indicated mm may not be + * the oldest, just the first into which it was faulted back. + */ + seek_head = &anonmm->list; + list_for_each_entry(anonmm, seek_head, list) { + if (!anonmm->mm || !anonmm->mm->rss) + continue; + referenced += page_referenced_one( + page, anonmm->mm, page->index, mapcount); + if (!*mapcount) + goto out; + } +out: + spin_unlock(&anonhd->lock); + return referenced; +} + +/** + * page_referenced_obj - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 0, so try_to_unmap will then have a go. 
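
vma_address above recovers the user virtual address at which a file page should appear from page->index, vm_pgoff and vm_start, returning NOADDR when the page falls outside the vma. A quick userspace rendition, assuming 4K pages and PAGE_CACHE_SHIFT equal to PAGE_SHIFT, with made-up vma numbers:

#include <stdio.h>

#define PAGE_SHIFT      12
#define NOADDR          (~0UL)

/* Mirror of vma_address(): where should file page 'index' sit in this vma? */
static unsigned long model_vma_address(unsigned long index,
                                       unsigned long vm_start,
                                       unsigned long vm_end,
                                       unsigned long vm_pgoff)
{
        unsigned long address = vm_start + ((index - vm_pgoff) << PAGE_SHIFT);

        return (address >= vm_start && address < vm_end) ? address : NOADDR;
}

int main(void)
{
        /* A made-up vma mapping file pages 0x10..0x2f at 0x40000000. */
        unsigned long start = 0x40000000UL, end = 0x40020000UL, pgoff = 0x10;

        printf("page 0x18 -> %#lx\n",
               model_vma_address(0x18, start, end, pgoff));
        printf("page 0x05 -> %#lx (outside the vma)\n",
               model_vma_address(0x05, start, end, pgoff));
        return 0;
}
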
+ */ +static inline int page_referenced_obj(struct page *page, int *mapcount) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + unsigned long address; + int referenced = 0; + + if (down_trylock(&mapping->i_shared_sem)) + return 0; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!vma->vm_mm->rss) + continue; + address = vma_address(page, vma); + if (address == NOADDR) + continue; + if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) == + (VM_LOCKED|VM_MAYSHARE)) { + referenced++; + goto out; + } + referenced += page_referenced_one( + page, vma->vm_mm, address, mapcount); + if (!*mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (!vma->vm_mm->rss || (vma->vm_flags & VM_NONLINEAR)) + continue; + address = vma_address(page, vma); + if (address == NOADDR) + continue; + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) { + referenced++; + goto out; + } + referenced += page_referenced_one( + page, vma->vm_mm, address, mapcount); + if (!*mapcount) + goto out; + } +out: + up(&mapping->i_shared_sem); + return referenced; +} + /** * page_referenced - test if the page was referenced * @page: the page to test * * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. + * returns the number of ptes which referenced the page. + * Caller needs to hold the rmap_lock. */ int fastcall page_referenced(struct page * page) { - struct pte_chain *pc; + int mapcount = page->mapcount; int referenced = 0; if (page_test_and_clear_young(page)) @@ -124,410 +331,505 @@ int fastcall page_referenced(struct page if (TestClearPageReferenced(page)) referenced++; - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; - - /* Check all the page tables mapping this page. */ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; - } - } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); - } + if (page->mapcount && page->mapping) { + if (PageAnon(page)) + referenced += page_referenced_anon(page, &mapcount); + else + referenced += page_referenced_obj(page, &mapcount); } return referenced; } /** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page + * page_add_anon_rmap - add pte mapping to an anonymous page + * @page: the page to add the mapping to + * @mm: the mm in which the mapping is added + * @address: the user virtual address mapped * - * Add a new pte reverse mapping to a page. * The caller needs to hold the mm->page_table_lock. 
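
page_referenced_anon and page_referenced_obj above (and the try_to_unmap variants later in the patch) thread an int *mapcount through their helpers: every pte actually found decrements it, so the walk over mms or vmas stops as soon as all known mappings have been accounted for. A toy model of that early-exit pattern, with check_one and maps_page invented for illustration:

#include <stdio.h>

#define NR_MMS  8

/* 1 where this made-up mm really maps the page, 0 where it does not. */
static const int maps_page[NR_MMS] = { 0, 1, 0, 1, 0, 0, 0, 0 };

/* Stand-in for page_referenced_one(): report a hit, consume a mapping. */
static int check_one(int mm, int *mapcount)
{
        if (!maps_page[mm])
                return 0;
        (*mapcount)--;
        return 1;               /* pretend the young bit was set */
}

int main(void)
{
        int mapcount = 2;       /* page->mapcount: two ptes map this page */
        int referenced = 0;
        int mm, visited = 0;

        for (mm = 0; mm < NR_MMS && mapcount; mm++) {
                referenced += check_one(mm, &mapcount);
                visited++;
        }
        /* Stops after the fourth mm: the other four need not be scanned. */
        printf("referenced=%d after visiting %d of %d mms\n",
               referenced, visited, NR_MMS);
        return 0;
}
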
*/ -struct pte_chain * fastcall -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +void fastcall page_add_anon_rmap(struct page *page, + struct mm_struct *mm, unsigned long address) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; + struct anonmm *anonmm; - if (PageReserved(page)) - return pte_chain; + BUG_ON(PageReserved(page)); + BUG_ON(page_mapping(page)); - pte_chain_lock(page); - - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); + rmap_lock(page); + if (!page->mapcount) { + anonmm = mm->anonmm; + SetPageAnon(page); + page->index = address & PAGE_MASK; + page->mapping = (void *) anonmm; + atomic_inc(&anonmm->count); inc_page_state(nr_mapped); - goto out; } + page->mapcount++; + rmap_unlock(page); +} - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ - goto out; - } +/** + * page_update_anon_rmap - move pte mapping of an anonymous page + * @page: the page to update the mapping of + * @mm: the new mm in which the mapping is found + * @address: the new user virtual address mapped + * + * The caller needs to hold the mm->page_table_lock. + * + * For do_wp_page: to update mapping to the one remaining mm. + * For copy_one_pte: to update address when vma is mremapped. + */ +void fastcall page_update_anon_rmap(struct page *page, + struct mm_struct *mm, unsigned long address) +{ + struct anonmm *anonmm; - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ - goto out; + BUG_ON(!PageAnon(page)); + if (page->mapcount != 1) + return; + + anonmm = mm->anonmm; + address &= PAGE_MASK; + if (anonmm == (struct anonmm *) page->mapping && + address == page->index) + return; + + rmap_lock(page); + if (page->mapcount == 1) { + page->index = address; + if (anonmm != (struct anonmm *) page->mapping) { + clear_page_anon(page); + SetPageAnon(page); + page->mapping = (void *) anonmm; + atomic_inc(&anonmm->count); + } } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; -out: - pte_chain_unlock(page); - return pte_chain; + rmap_unlock(page); } /** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove + * page_add_obj_rmap - add pte mapping to a file page + * @page: the page to add the mapping to * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. + * The caller needs to hold the mm->page_table_lock. 
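
page_add_anon_rmap and page_add_obj_rmap above, together with page_remove_rmap just below, keep two counters in step: page->mapcount itself and the global nr_mapped statistic, with the anon state (and its anonmm reference) taken on the 0 to 1 transition and dropped again on the 1 to 0 one. A compressed userspace model of that bookkeeping, hypothetical names, no rmap_lock:

#include <assert.h>
#include <stdio.h>

/* Userspace model of the 0 <-> 1 mapcount transitions, hypothetical names. */
struct model_page {
        int mapcount;
        int anon;
};

static int nr_mapped;           /* models the global nr_mapped page state */

static void add_rmap(struct model_page *page, int anon)
{
        if (!page->mapcount) {  /* first mapping: the page becomes "mapped" */
                nr_mapped++;
                page->anon = anon;      /* anon pages also pin an anonmm here */
        }
        page->mapcount++;
}

static void remove_rmap(struct model_page *page)
{
        assert(page->mapcount > 0);
        if (!--page->mapcount) {        /* last mapping gone */
                page->anon = 0;         /* clear_page_anon() drops the anonmm */
                nr_mapped--;
        }
}

int main(void)
{
        struct model_page page = { 0, 0 };

        add_rmap(&page, 1);             /* faulted into one mm */
        add_rmap(&page, 1);             /* duplicated by fork (page_dup_rmap) */
        remove_rmap(&page);
        remove_rmap(&page);
        assert(nr_mapped == 0 && page.mapcount == 0);
        printf("mapcount bookkeeping balanced\n");
        return 0;
}
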
*/ -void fastcall page_remove_rmap(struct page *page, pte_t *ptep) +void fastcall page_add_obj_rmap(struct page *page) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; - + BUG_ON(PageAnon(page)); if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) return; - pte_chain_lock(page); + rmap_lock(page); + if (!page->mapcount) + inc_page_state(nr_mapped); + page->mapcount++; + rmap_unlock(page); +} - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * + * Caller needs to hold the mm->page_table_lock. + */ +void fastcall page_remove_rmap(struct page *page) +{ + BUG_ON(PageReserved(page)); + BUG_ON(!page->mapcount); - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; - } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = pte_chain_idx(start); - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } - goto out; - } - } - } -out: - if (page->pte.direct == 0 && page_test_and_clear_dirty(page)) - set_page_dirty(page); - if (!page_mapped(page)) + rmap_lock(page); + page->mapcount--; + if (!page->mapcount) { + if (page_test_and_clear_dirty(page)) + set_page_dirty(page); + if (PageAnon(page)) + clear_page_anon(page); dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; + } + rmap_unlock(page); } /** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page - * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. - * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock - */ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); -static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr) -{ - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; + ** Subfunctions of try_to_unmap: try_to_unmap_one called + ** repeatedly from either try_to_unmap_anon or try_to_unmap_obj. + **/ - if (!mm) - BUG(); +static int try_to_unmap_one(struct page *page, struct mm_struct *mm, + unsigned long address, int *mapcount, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; /* * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } + if (!spin_trylock(&mm->page_table_lock)) + goto out; + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; - /* During mremap, it's possible pages are not in a VMA. 
*/ - vma = find_vma(mm, address); - if (!vma) { - ret = SWAP_FAIL; + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) goto out_unlock; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + (*mapcount)--; + + if (!vma) { + vma = find_vma(mm, address); + /* unmap_vmas drops page_table_lock with vma unlinked */ + if (!vma) + goto out_unmap; } - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + ptep_test_and_clear_young(pte)) { ret = SWAP_FAIL; - goto out_unlock; + goto out_unmap; } /* Nuke the page table entry. */ flush_cache_page(vma, address); - pte = ptep_clear_flush(vma, address, ptep); + pteval = ptep_clear_flush(vma, address, pte); - if (PageSwapCache(page)) { + if (PageAnon(page)) { + swp_entry_t entry = { .val = page->private }; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ - swp_entry_t entry = { .val = page->index }; + BUG_ON(!PageSwapCache(page)); swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); - } else { - unsigned long pgidx; - /* - * If a nonlinear mapping then store the file page offset - * in the pte. - */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); - } + set_pte(pte, swp_entry_to_pte(entry)); + BUG_ON(pte_file(*pte)); } /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) + if (pte_dirty(pteval)) set_page_dirty(page); mm->rss--; + BUG_ON(!page->mapcount); + page->mapcount--; page_cache_release(page); - ret = SWAP_SUCCESS; + +out_unmap: + pte_unmap(pte); out_unlock: - rmap_ptep_unmap(ptep); spin_unlock(&mm->page_table_lock); + +out: return ret; } -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its pte chain lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable +/* + * try_to_unmap_cluster is only used on VM_NONLINEAR shared object vmas, + * in which objrmap is unable to predict where a page will be found. */ -int fastcall try_to_unmap(struct page * page) -{ - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i; +#define CLUSTER_SIZE (32 * PAGE_SIZE) +#if CLUSTER_SIZE > PMD_SIZE +#undef CLUSTER_SIZE +#define CLUSTER_SIZE PMD_SIZE +#endif +#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) + +static int try_to_unmap_cluster(struct mm_struct *mm, + unsigned long cursor, int *mapcount, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + struct page *page; + unsigned long address; + unsigned long end; + unsigned long pfn; + unsigned long pgidx; - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); - /* We need backing store to swap out a page. 
*/ - if (!page->mapping) - BUG(); + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_FAIL; - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - if (page_test_and_clear_dirty(page)) - set_page_dirty(page); - page->pte.direct = 0; - ClearPageDirect(page); - } - goto out; - } + address = (vma->vm_start + cursor) & CLUSTER_MASK; + end = address + CLUSTER_SIZE; + if (address < vma->vm_start) + address = vma->vm_start; + if (end > vma->vm_end) + end = vma->vm_end; - start = page->pte.chain; - victim_i = pte_chain_idx(start); - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. - */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - if (page->pte.direct == 0 && - page_test_and_clear_dirty(page)) - set_page_dirty(page); - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. */ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; - goto out; - } + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + for (pte = pte_offset_map(pmd, address); + address < end; pte++, address += PAGE_SIZE) { + + if (!pte_present(*pte)) + continue; + + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + BUG_ON(PageAnon(page)); + if (PageReserved(page)) + continue; + + if (ptep_test_and_clear_young(pte)) + continue; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address); + pteval = ptep_clear_flush(vma, address, pte); + + /* If nonlinear, store the file page offset in the pte. */ + pgidx = (address - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (page->index != pgidx) { + set_pte(pte, pgoff_to_pte(page->index)); + BUG_ON(!pte_file(*pte)); } + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + page_remove_rmap(page); + page_cache_release(page); + mm->rss--; + (*mapcount)--; + } + + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return SWAP_AGAIN; +} + +static inline int try_to_unmap_anon(struct page *page, int *mapcount) +{ + struct anonmm *anonmm = (struct anonmm *) page->mapping; + struct anonmm *anonhd = anonmm->head; + struct list_head *seek_head; + int ret = SWAP_AGAIN; + + spin_lock(&anonhd->lock); + /* + * First try the indicated mm, it's the most likely. + */ + if (anonmm->mm && anonmm->mm->rss) { + ret = try_to_unmap_one( + page, anonmm->mm, page->index, mapcount, NULL); + if (ret == SWAP_FAIL || !*mapcount) + goto out; + } + + /* + * Then down the rest of the list, from that as the head. Stop + * when we reach anonhd? 
No: although a page cannot get dup'ed + * into an older mm, once swapped, its indicated mm may not be + * the oldest, just the first into which it was faulted back. + */ + seek_head = &anonmm->list; + list_for_each_entry(anonmm, seek_head, list) { + if (!anonmm->mm || !anonmm->mm->rss) + continue; + ret = try_to_unmap_one( + page, anonmm->mm, page->index, mapcount, NULL); + if (ret == SWAP_FAIL || !*mapcount) + goto out; } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); + spin_unlock(&anonhd->lock); return ret; } /** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct pte_chain *pc = p; + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static inline int try_to_unmap_obj(struct page *page, int *mapcount) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + unsigned long address; + int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (!vma->vm_mm->rss) + continue; + address = vma_address(page, vma); + if (address == NOADDR) + continue; + ret = try_to_unmap_one( + page, vma->vm_mm, address, mapcount, vma); + if (ret == SWAP_FAIL || !*mapcount) + goto out; + } - memset(pc, 0, sizeof(*pc)); -} + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + /* + * Defer unmapping nonlinear to the next loop, + * but take notes while we're here e.g. don't + * want to loop again when no nonlinear vmas. + */ + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; + continue; + } + if (!vma->vm_mm->rss) + continue; + address = vma_address(page, vma); + if (address == NOADDR) + continue; + ret = try_to_unmap_one( + page, vma->vm_mm, address, mapcount, vma); + if (ret == SWAP_FAIL || !*mapcount) + goto out; + } -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; + if (max_nl_size == 0) /* no nonlinear vmas of this file */ + goto out; -/** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - struct pte_chain **pte_chainp; + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? 
+ */ + rmap_unlock(page); - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (VM_NONLINEAR != (vma->vm_flags & + (VM_NONLINEAR|VM_LOCKED|VM_RESERVED))) + continue; + cursor = (unsigned long) vma->vm_private_data; + while (vma->vm_mm->rss && + cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + ret = try_to_unmap_cluster(vma->vm_mm, + cursor, mapcount, vma); + if (ret == SWAP_FAIL) + break; + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if (*mapcount <= 0) + goto relock; + } + if (ret != SWAP_FAIL) + vma->vm_private_data = + (void *) max_nl_cursor; + ret = SWAP_AGAIN; + } + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). - * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. - */ -struct pte_chain *pte_chain_alloc(int gfp_flags) -{ - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - might_sleep_if(gfp_flags & __GFP_WAIT); - - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) == + VM_NONLINEAR) + vma->vm_private_data = 0; } +relock: + rmap_lock(page); +out: + up(&mapping->i_shared_sem); return ret; } -void __init pte_chain_init(void) +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock + * and its rmap_lock. 
Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable + */ +int fastcall try_to_unmap(struct page * page) { - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); + int mapcount = page->mapcount; + int ret; - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(!page->mapcount); + + if (PageAnon(page)) + ret = try_to_unmap_anon(page, &mapcount); + else + ret = try_to_unmap_obj(page, &mapcount); + + if (!page->mapcount) { + if (page_test_and_clear_dirty(page)) + set_page_dirty(page); + if (PageAnon(page)) + clear_page_anon(page); + dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } + return ret; } diff -purN -X /home/mbligh/.diff.exclude reference/mm/swap_state.c current/mm/swap_state.c --- reference/mm/swap_state.c 2003-10-01 11:35:37.000000000 -0700 +++ current/mm/swap_state.c 2004-04-12 10:22:28.000000000 -0700 @@ -21,23 +21,20 @@ static struct backing_dev_info swap_back .memory_backed = 1, /* Does not contribute to dirty memory */ }; -extern struct address_space_operations swap_aops; +static struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .readpage = swap_readpage, + /* + * sync_page and set_page_dirty are special-cased. + */ +}; struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), .page_lock = SPIN_LOCK_UNLOCKED, - .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), - .dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages), - .io_pages = LIST_HEAD_INIT(swapper_space.io_pages), - .locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages), + .nrpages = 0, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, - .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), - .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), - .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), - .truncate_count = ATOMIC_INIT(0), - .private_lock = SPIN_LOCK_UNLOCKED, - .private_list = LIST_HEAD_INIT(swapper_space.private_list), }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -59,30 +56,55 @@ void show_swap_cache_info(void) swap_cache_info.noent_race, swap_cache_info.exist_race); } +/* + * __add_to_swap_cache resembles add_to_page_cache on swapper_space, + * but sets SwapCache flag and private instead of mapping and index. 
+ */ +static int __add_to_swap_cache(struct page *page, + swp_entry_t entry, int gfp_mask) +{ + int error; + + BUG_ON(PageSwapCache(page)); + BUG_ON(PagePrivate(page)); + error = radix_tree_preload(gfp_mask); + if (!error) { + page_cache_get(page); + spin_lock(&swapper_space.page_lock); + error = radix_tree_insert(&swapper_space.page_tree, + entry.val, page); + if (!error) { + SetPageLocked(page); + SetPageSwapCache(page); + page->private = entry.val; + total_swapcache_pages++; + pagecache_acct(1); + } else + page_cache_release(page); + spin_unlock(&swapper_space.page_lock); + radix_tree_preload_end(); + } + return error; +} + static int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; - if (page->mapping) - BUG(); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; } - error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL); + error = __add_to_swap_cache(page, entry, GFP_KERNEL); /* * Anon pages are already on the LRU, we don't run lru_cache_add here. */ - if (error != 0) { + if (error) { swap_free(entry); if (error == -EEXIST) INC_CACHE_INFO(exist_race); return error; } - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - BUG(); INC_CACHE_INFO(add_total); return 0; } @@ -96,7 +118,12 @@ void __delete_from_swap_cache(struct pag BUG_ON(!PageLocked(page)); BUG_ON(!PageSwapCache(page)); BUG_ON(PageWriteback(page)); - __remove_from_page_cache(page); + + radix_tree_delete(&swapper_space.page_tree, page->private); + page->private = 0; + ClearPageSwapCache(page); + total_swapcache_pages--; + pagecache_acct(-1); INC_CACHE_INFO(del_total); } @@ -140,8 +167,7 @@ int add_to_swap(struct page * page) /* * Add it to the swap cache and mark it dirty */ - err = add_to_page_cache(page, &swapper_space, - entry.val, GFP_ATOMIC); + err = __add_to_swap_cache(page, entry, GFP_ATOMIC); if (pf_flags & PF_MEMALLOC) current->flags |= PF_MEMALLOC; @@ -149,8 +175,7 @@ int add_to_swap(struct page * page) switch (err) { case 0: /* Success */ SetPageUptodate(page); - ClearPageDirty(page); - set_page_dirty(page); + SetPageDirty(page); INC_CACHE_INFO(add_total); return 1; case -EEXIST: @@ -176,11 +201,12 @@ void delete_from_swap_cache(struct page { swp_entry_t entry; + BUG_ON(!PageSwapCache(page)); BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - entry.val = page->index; + entry.val = page->private; spin_lock(&swapper_space.page_lock); __delete_from_swap_cache(page); @@ -192,27 +218,13 @@ void delete_from_swap_cache(struct page int move_to_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *mapping = page->mapping; - int err; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!err) { - __remove_from_page_cache(page); - ___add_to_page_cache(page, &swapper_space, entry.val); - } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - + int err = __add_to_swap_cache(page, entry, GFP_ATOMIC); if (!err) { + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ if (!swap_duplicate(entry)) BUG(); - /* shift page from clean_pages to dirty_pages list */ - BUG_ON(PageDirty(page)); - set_page_dirty(page); + SetPageDirty(page); INC_CACHE_INFO(add_total); } else if (err == -EEXIST) INC_CACHE_INFO(exist_race); @@ -222,29 +234,9 @@ int move_to_swap_cache(struct page *page int move_from_swap_cache(struct page *page, unsigned long index, struct address_space 
*mapping) { - swp_entry_t entry; - int err; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - BUG_ON(PagePrivate(page)); - - entry.val = page->index; - - spin_lock(&swapper_space.page_lock); - spin_lock(&mapping->page_lock); - - err = radix_tree_insert(&mapping->page_tree, index, page); + int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); if (!err) { - __delete_from_swap_cache(page); - ___add_to_page_cache(page, mapping, index); - } - - spin_unlock(&mapping->page_lock); - spin_unlock(&swapper_space.page_lock); - - if (!err) { - swap_free(entry); + delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ ClearPageDirty(page); set_page_dirty(page); @@ -252,7 +244,6 @@ int move_from_swap_cache(struct page *pa return err; } - /* * If we are the only user, then try to free up the swap cache. * @@ -310,19 +301,17 @@ void free_pages_and_swap_cache(struct pa */ struct page * lookup_swap_cache(swp_entry_t entry) { - struct page *found; + struct page *page; - found = find_get_page(&swapper_space, entry.val); - /* - * Unsafe to assert PageSwapCache and mapping on page found: - * if SMP nothing prevents swapoff from deleting this page from - * the swap cache at this moment. find_lock_page would prevent - * that, but no need to change: we _have_ got the right page. - */ - INC_CACHE_INFO(find_total); - if (found) + spin_lock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page) { + page_cache_get(page); INC_CACHE_INFO(find_success); - return found; + } + spin_unlock(&swapper_space.page_lock); + INC_CACHE_INFO(find_total); + return page; } /* @@ -340,10 +329,14 @@ struct page * read_swap_cache_async(swp_ /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling - * that would confuse statistics: use find_get_page() - * directly. + * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + spin_lock(&swapper_space.page_lock); + found_page = radix_tree_lookup(&swapper_space.page_tree, + entry.val); + if (found_page) + page_cache_get(found_page); + spin_unlock(&swapper_space.page_lock); if (found_page) break; diff -purN -X /home/mbligh/.diff.exclude reference/mm/swapfile.c current/mm/swapfile.c --- reference/mm/swapfile.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/swapfile.c 2004-04-12 10:22:28.000000000 -0700 @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -158,7 +158,7 @@ out: return entry; } -static struct swap_info_struct * swap_info_get(swp_entry_t entry) +struct swap_info_struct * swap_info_get(swp_entry_t entry) { struct swap_info_struct * p; unsigned long offset, type; @@ -197,7 +197,7 @@ out: return NULL; } -static void swap_info_put(struct swap_info_struct * p) +void swap_info_put(struct swap_info_struct * p) { swap_device_unlock(p); swap_list_unlock(); @@ -247,14 +247,14 @@ static int exclusive_swap_page(struct pa struct swap_info_struct * p; swp_entry_t entry; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (p) { /* Is the only swap cache user the cache itself? */ if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ spin_lock(&swapper_space.page_lock); - if (page_count(page) - !!PagePrivate(page) == 2) + if (page_count(page) == 2) retval = 1; spin_unlock(&swapper_space.page_lock); } @@ -315,7 +315,7 @@ int remove_exclusive_swap_page(struct pa if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->index; + entry.val = page->private; p = swap_info_get(entry); if (!p) return 0; @@ -353,8 +353,14 @@ void free_swap_and_cache(swp_entry_t ent p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, swp_offset(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); + if (swap_entry_free(p, swp_offset(entry)) == 1) { + spin_lock(&swapper_space.page_lock); + page = radix_tree_lookup(&swapper_space.page_tree, + entry.val); + if (page && TestSetPageLocked(page)) + page = NULL; + spin_unlock(&swapper_space.page_lock); + } swap_info_put(p); } if (page) { @@ -385,19 +391,19 @@ void free_swap_and_cache(swp_entry_t ent /* vma->vm_mm->page_table_lock is held */ static void unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - *pte_chainp = page_add_rmap(page, dir, *pte_chainp); + page_add_anon_rmap(page, vma->vm_mm, address); swap_free(entry); } /* vma->vm_mm->page_table_lock is held */ static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pte_t * pte; unsigned long end; @@ -422,8 +428,7 @@ static int unuse_pmd(struct vm_area_stru * Test inline before going to call unuse_pte. 
*/ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, offset + address, pte, - entry, page, pte_chainp); + unuse_pte(vma, offset + address, pte, entry, page); pte_unmap(pte); return 1; } @@ -437,7 +442,7 @@ static int unuse_pmd(struct vm_area_stru /* vma->vm_mm->page_table_lock is held */ static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long size, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { pmd_t * pmd; unsigned long offset, end; @@ -459,7 +464,7 @@ static int unuse_pgd(struct vm_area_stru BUG(); do { if (unuse_pmd(vma, pmd, address, end - address, - offset, entry, page, pte_chainp)) + offset, entry, page)) return 1; address = (address + PMD_SIZE) & PMD_MASK; pmd++; @@ -469,15 +474,14 @@ static int unuse_pgd(struct vm_area_stru /* vma->vm_mm->page_table_lock is held */ static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp) + swp_entry_t entry, struct page *page) { unsigned long start = vma->vm_start, end = vma->vm_end; if (start >= end) BUG(); do { - if (unuse_pgd(vma, pgdir, start, end - start, - entry, page, pte_chainp)) + if (unuse_pgd(vma, pgdir, start, end - start, entry, page)) return 1; start = (start + PGDIR_SIZE) & PGDIR_MASK; pgdir++; @@ -485,15 +489,10 @@ static int unuse_vma(struct vm_area_stru return 0; } -static int unuse_process(struct mm_struct * mm, +static void unuse_process(struct mm_struct * mm, swp_entry_t entry, struct page* page) { struct vm_area_struct* vma; - struct pte_chain *pte_chain; - - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - return -ENOMEM; /* * Go through process' page directory. @@ -501,12 +500,10 @@ static int unuse_process(struct mm_struc spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { pgd_t * pgd = pgd_offset(mm, vma->vm_start); - if (unuse_vma(vma, pgd, entry, page, &pte_chain)) + if (unuse_vma(vma, pgd, entry, page)) break; } spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; } /* @@ -654,7 +651,7 @@ static int try_to_unuse(unsigned int typ if (start_mm == &init_mm) shmem = shmem_unuse(entry, page); else - retval = unuse_process(start_mm, entry, page); + unuse_process(start_mm, entry, page); } if (*swap_map > 1) { int set_start_mm = (*swap_map >= swcount); @@ -666,7 +663,7 @@ static int try_to_unuse(unsigned int typ atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (*swap_map > 1 && !retval && + while (*swap_map > 1 && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); atomic_inc(&mm->mm_users); @@ -683,7 +680,7 @@ static int try_to_unuse(unsigned int typ set_start_mm = 1; shmem = shmem_unuse(entry, page); } else - retval = unuse_process(mm, entry, page); + unuse_process(mm, entry, page); if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); @@ -697,11 +694,6 @@ static int try_to_unuse(unsigned int typ mmput(start_mm); start_mm = new_start_mm; } - if (retval) { - unlock_page(page); - page_cache_release(page); - break; - } /* * How could swap count reach 0x7fff when the maximum @@ -996,14 +988,14 @@ int page_queue_congested(struct page *pa BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ - bdi = page->mapping->backing_dev_info; if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->private }; 
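The swapfile.c hunks here repeat a convention this patch introduces throughout rmap.c, vmscan.c and swap_state.c: for a PageSwapCache page the swp_entry_t now lives in page->private (filled in by __add_to_swap_cache) rather than in page->index, with page->mapping left for anon/file use. As a rough sketch only, a hypothetical helper wrapping that convention might read as follows; page_swp_entry() is not part of the patch and is named here purely for illustration.

/* Illustrative sketch, not part of the patch: shows the convention the
 * surrounding hunks rely on, i.e. a swap cache page keeps its swap
 * entry in page->private.
 */
static inline swp_entry_t page_swp_entry(struct page *page)
{
	swp_entry_t entry;

	BUG_ON(!PageSwapCache(page));
	entry.val = page->private;
	return entry;
}

Callers such as exclusive_swap_page(), remove_exclusive_swap_page() and page_queue_congested() above each open-code exactly this lookup after the conversion from page->index.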
struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info; - } + } else + bdi = page->mapping->backing_dev_info; return bdi_write_congested(bdi); } #endif diff -purN -X /home/mbligh/.diff.exclude reference/mm/vmscan.c current/mm/vmscan.c --- reference/mm/vmscan.c 2004-04-07 14:54:38.000000000 -0700 +++ current/mm/vmscan.c 2004-04-12 10:22:26.000000000 -0700 @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -173,23 +173,23 @@ static int shrink_slab(unsigned long sca return 0; } -/* Must be called with page's pte_chain_lock held. */ +/* Must be called with page's rmap_lock held. */ static inline int page_mapping_inuse(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping; /* Page is in somebody's page tables. */ if (page_mapped(page)) return 1; - /* XXX: does this happen ? */ - if (!mapping) - return 0; - /* Be more reluctant to reclaim swapcache than pagecache */ if (PageSwapCache(page)) return 1; + mapping = page_mapping(page); + if (!mapping) + return 0; + /* File is mmap'd by somebody. */ if (!list_empty(&mapping->i_mmap)) return 1; @@ -233,7 +233,7 @@ static void handle_write_error(struct ad struct page *page, int error) { lock_page(page); - if (page->mapping == mapping) { + if (page_mapping(page) == mapping) { if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else @@ -277,29 +277,31 @@ shrink_list(struct list_head *page_list, if (PageWriteback(page)) goto keep_locked; - pte_chain_lock(page); + rmap_lock(page); referenced = page_referenced(page); if (referenced && page_mapping_inuse(page)) { /* In active use or really unfreeable. Activate it. */ - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; } - mapping = page->mapping; + mapping = page_mapping(page); #ifdef CONFIG_SWAP /* - * Anonymous process memory without backing store. Try to - * allocate it some swap space here. + * Anonymous process memory has backing store? + * Try to allocate it some swap space here. * * XXX: implement swap clustering ? 
*/ - if (page_mapped(page) && !mapping && !PagePrivate(page)) { - pte_chain_unlock(page); + if (PageSwapCache(page)) + mapping = &swapper_space; + else if (PageAnon(page)) { + rmap_unlock(page); if (!add_to_swap(page)) goto activate_locked; - pte_chain_lock(page); - mapping = page->mapping; + rmap_lock(page); + mapping = &swapper_space; } #endif /* CONFIG_SWAP */ @@ -313,16 +315,16 @@ shrink_list(struct list_head *page_list, if (page_mapped(page) && mapping) { switch (try_to_unmap(page)) { case SWAP_FAIL: - pte_chain_unlock(page); + rmap_unlock(page); goto activate_locked; case SWAP_AGAIN: - pte_chain_unlock(page); + rmap_unlock(page); goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ } } - pte_chain_unlock(page); + rmap_unlock(page); /* * If the page is dirty, only perform writeback if that write @@ -364,7 +366,9 @@ shrink_list(struct list_head *page_list, .for_reclaim = 1, }; - list_move(&page->list, &mapping->locked_pages); + if (!PageSwapCache(page)) + list_move(&page->list, + &mapping->locked_pages); spin_unlock(&mapping->page_lock); SetPageReclaim(page); @@ -429,7 +433,7 @@ shrink_list(struct list_head *page_list, #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->index }; + swp_entry_t swap = { .val = page->private }; __delete_from_swap_cache(page); spin_unlock(&mapping->page_lock); swap_free(swap); @@ -658,20 +662,19 @@ refill_inactive_zone(struct zone *zone, list_add(&page->lru, &l_active); continue; } - pte_chain_lock(page); + rmap_lock(page); if (page_referenced(page)) { - pte_chain_unlock(page); + rmap_unlock(page); list_add(&page->lru, &l_active); continue; } - pte_chain_unlock(page); + rmap_unlock(page); } /* * FIXME: need to consider page_count(page) here if/when we * reap orphaned pages via the LRU (Daniel's locking stuff) */ - if (total_swap_pages == 0 && !page->mapping && - !PagePrivate(page)) { + if (total_swap_pages == 0 && PageAnon(page)) { list_add(&page->lru, &l_active); continue; }
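For orientation, the shrink_list() hunk above keeps the pageout contract stated in the try_to_unmap() comment: take the (renamed) rmap lock, give anonymous pages swap space via add_to_swap(), then branch on the three return codes. The condensed fragment below only paraphrases that hunk under those assumptions; it is not additional patch content, and the writeback and freeing paths are omitted.

	/* Condensed paraphrase of the shrink_list() path above. */
	rmap_lock(page);
	mapping = page_mapping(page);
	if (PageSwapCache(page))
		mapping = &swapper_space;
	else if (PageAnon(page)) {
		rmap_unlock(page);
		if (!add_to_swap(page))		/* no swap slot: reactivate */
			goto activate_locked;
		rmap_lock(page);
		mapping = &swapper_space;
	}
	if (page_mapped(page) && mapping) {
		switch (try_to_unmap(page)) {
		case SWAP_FAIL:			/* unswappable */
			rmap_unlock(page);
			goto activate_locked;
		case SWAP_AGAIN:		/* missed a trylock, retry later */
			rmap_unlock(page);
			goto keep_locked;
		case SWAP_SUCCESS:
			;			/* fall through and free below */
		}
	}
	rmap_unlock(page);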