Patch from: Ingo Molnar

The attached patch, against BK-curr, is a preparation to make remap_file_pages() usable on swappable vmas as well. When 'swapping out' shared named mappings, the page offset is written into the pte. It takes one bit from the swap-type bits but otherwise does not change the pte layout, so it should be easy to adapt any other architecture to this change as well. (This patch does not introduce the protection-bits-in-pte approach used in my previous patch.)

On 32-bit pte sizes with an effective usable pte range of 29 bits, this limits the mmap()-able file size to 4096 * 2^29 == 2^41 bytes == 2 TB. If the usable range is smaller, then the maximum mmap() size is reduced as well. The worst case I found (PPC) was 2 hw-reserved bits in the swap case, which limits us to a 1 TB file size. Is there any other hw that has an even worse ratio of sw-usable pte bits?

This mmap() limit could be eliminated by simply not converting the swapped-out pte to a file pte, but clearing it and falling back to the linear mapping upon swapin. That would confine the limit to remap_file_pages() alone, but I really hope no one wants to use remap_file_pages() on a 32-bit platform on a larger than 1-2 TB file.

sys_remap_file_pages() now enforces that the 'prot' parameter be zero. This restriction might be lifted in the future - I really hope we can have more flexible remapping once 64-bit platforms are commonplace - eg. things like memory debuggers could just use the permission bits directly, instead of creating many small vmas.

I've tested swappable nonlinear ptes and they are swapped out/in correctly.

Some other changes in -A0 relative to 2.5.63-BK:

- slightly smarter TLB flushing in install_page(). This is still only a stupid helper function - a more efficient 'walk the pagecache tree and pagetable at once and use TLB-gather' implementation is preferred.

- cleanup: pass on pgprot_t instead of unsigned long prot.

- some sanity checks to make sure the file_pte() rules are followed.

- do not reduce the vma's default protection to PROT_NONE when using remap_file_pages() on it. With swappable ptes this is now safe.
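As an illustration of the intended usage, here is a minimal userspace sketch (not part of the patch; it assumes a glibc wrapper for the syscall is available, and the file name and page indices are arbitrary). Note that with this patch the 'prot' argument must be 0:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);	/* hypothetical test file */

	if (fd < 0)
		return 1;

	/* Linear MAP_SHARED window over the first 4 pages of the file. */
	char *win = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (win == MAP_FAILED)
		return 1;

	/*
	 * Rebind page 0 of the window to file page 3. No new vma is
	 * created - only the pagetable entry changes. prot is 0, so
	 * the vma's existing protection is used.
	 */
	if (remap_file_pages(win, psz, 0, 3, 0) < 0)
		perror("remap_file_pages");

	munmap(win, 4 * psz);
	close(fd);
	return 0;
}

The pgoff argument (3 above) is in units of pages, and the call only rewrites ptes within the existing vma.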
 25-akpm/include/asm-i386/pgtable-2level.h |   10 +++++
 25-akpm/include/asm-i386/pgtable-3level.h |    7 ++++
 25-akpm/include/asm-i386/pgtable.h        |    4 +-
 25-akpm/include/linux/mm.h                |    4 +-
 25-akpm/include/linux/swapops.h           |    2 +
 25-akpm/mm/filemap.c                      |    2 -
 25-akpm/mm/fremap.c                       |   52 ++++++++++++++----------------
 25-akpm/mm/memory.c                       |   43 +++++++++++++++++++++++-
 25-akpm/mm/rmap.c                         |   17 +++++++++
 25-akpm/mm/shmem.c                        |    2 -
 25-akpm/mm/swapfile.c                     |    2 +
 11 files changed, 110 insertions(+), 35 deletions(-)

diff -puN include/asm-i386/pgtable-2level.h~remap-file-pages-2.5.63-a1 include/asm-i386/pgtable-2level.h
--- 25/include/asm-i386/pgtable-2level.h~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/include/asm-i386/pgtable-2level.h	Wed Mar 12 14:16:23 2003
@@ -63,4 +63,14 @@ static inline pmd_t * pmd_offset(pgd_t *
 #define pfn_pte(pfn, prot)	__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 #define pfn_pmd(pfn, prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
+/*
+ * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
+ * into this range:
+ */
+#define pte_to_pgoff(pte) \
+	((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
+
+#define pgoff_to_pte(off) \
+	((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
+
 #endif /* _I386_PGTABLE_2LEVEL_H */
diff -puN include/asm-i386/pgtable-3level.h~remap-file-pages-2.5.63-a1 include/asm-i386/pgtable-3level.h
--- 25/include/asm-i386/pgtable-3level.h~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/include/asm-i386/pgtable-3level.h	Wed Mar 12 14:16:23 2003
@@ -115,4 +115,11 @@ static inline pmd_t pfn_pmd(unsigned lon
 	return __pmd(((unsigned long long)page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
 }
 
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff -puN include/asm-i386/pgtable.h~remap-file-pages-2.5.63-a1 include/asm-i386/pgtable.h
--- 25/include/asm-i386/pgtable.h~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/include/asm-i386/pgtable.h	Wed Mar 12 14:16:23 2003
@@ -110,6 +110,7 @@ void pgtable_cache_init(void);
 #define _PAGE_PSE	0x080	/* 4 MB (or 2MB) page, Pentium+, if present.. */
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry PPro+ */
 
+#define _PAGE_FILE	0x040	/* pagecache or swap? */
 #define _PAGE_PROTNONE	0x080	/* If not present */
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -187,6 +188,7 @@ static inline int pte_exec(pte_t pte) {
 static inline int pte_dirty(pte_t pte)		{ return (pte).pte_low & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return (pte).pte_low & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return (pte).pte_low & _PAGE_RW; }
+static inline int pte_file(pte_t pte)		{ return (pte).pte_low & _PAGE_FILE; }
 
 static inline pte_t pte_rdprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_USER; return pte; }
 static inline pte_t pte_exprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_USER; return pte; }
@@ -282,7 +284,7 @@ typedef pte_t *pte_addr_t;
 #define update_mmu_cache(vma,address,pte) do { } while (0)
 
 /* Encode and de-code a swap entry */
-#define __swp_type(x)			(((x).val >> 1) & 0x3f)
+#define __swp_type(x)			(((x).val >> 1) & 0x1f)
 #define __swp_offset(x)			((x).val >> 8)
 #define __swp_entry(type, offset)	((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { (pte).pte_low })
diff -puN include/linux/mm.h~remap-file-pages-2.5.63-a1 include/linux/mm.h
--- 25/include/linux/mm.h~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/include/linux/mm.h	Wed Mar 12 14:16:23 2003
@@ -136,7 +136,7 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
-	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, unsigned long prot, unsigned long pgoff, int nonblock);
+	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
 };
 
 /* forward declaration; pte_chain is meant to be internal to rmap.c */
@@ -417,7 +417,7 @@ extern int vmtruncate(struct inode * ino
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
-extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, unsigned long prot);
+extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
diff -puN include/linux/swapops.h~remap-file-pages-2.5.63-a1 include/linux/swapops.h
--- 25/include/linux/swapops.h~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/include/linux/swapops.h	Wed Mar 12 14:16:23 2003
@@ -51,6 +51,7 @@ static inline swp_entry_t pte_to_swp_ent
 {
 	swp_entry_t arch_entry;
 
+	BUG_ON(pte_file(pte));
 	arch_entry = __pte_to_swp_entry(pte);
 	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
 }
@@ -64,5 +65,6 @@ static inline pte_t swp_entry_to_pte(swp
 	swp_entry_t arch_entry;
 
 	arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
+	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
diff -puN mm/filemap.c~remap-file-pages-2.5.63-a1 mm/filemap.c
--- 25/mm/filemap.c~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/mm/filemap.c	Wed Mar 12 14:16:23 2003
@@ -1195,7 +1195,7 @@ err:
 static int filemap_populate(struct vm_area_struct *vma,
 			unsigned long addr,
 			unsigned long len,
-			unsigned long prot,
+			pgprot_t prot,
 			unsigned long pgoff,
 			int nonblock)
 {
diff -puN mm/fremap.c~remap-file-pages-2.5.63-a1 mm/fremap.c
--- 25/mm/fremap.c~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/mm/fremap.c	Wed Mar 12 14:18:18 2003
@@ -16,12 +16,12 @@
 #include <linux/swapops.h>
 #include <asm/mmu_context.h>
 
-static inline void zap_pte(struct mm_struct *mm, pte_t *ptep)
+static inline int zap_pte(struct mm_struct *mm, pte_t *ptep)
 {
 	pte_t pte = *ptep;
 
 	if (pte_none(pte))
-		return;
+		return 0;
 
 	if (pte_present(pte)) {
 		unsigned long pfn = pte_pfn(pte);
@@ -36,9 +36,12 @@ static inline void zap_pte(struct mm_str
 			mm->rss--;
 		}
 	}
+		return 1;
 	} else {
-		free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte))
+			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(ptep);
+		return 0;
 	}
 }
@@ -47,9 +50,9 @@ static inline void zap_pte(struct mm_str
  * previously existing mapping.
  */
 int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, struct page *page, unsigned long prot)
+		unsigned long addr, struct page *page, pgprot_t prot)
 {
-	int err = -ENOMEM;
+	int err = -ENOMEM, flush;
 	pte_t *pte, entry;
 	pgd_t *pgd;
 	pmd_t *pmd;
@@ -69,18 +72,17 @@ int install_page(struct mm_struct *mm, s
 	if (!pte)
 		goto err_unlock;
 
-	zap_pte(mm, pte);
+	flush = zap_pte(mm, pte);
 
 	mm->rss++;
 	flush_page_to_ram(page);
 	flush_icache_page(vma, page);
-	entry = mk_pte(page, protection_map[prot]);
-	if (prot & PROT_WRITE)
-		entry = pte_mkwrite(pte_mkdirty(entry));
+	entry = mk_pte(page, prot);
 	set_pte(pte, entry);
 	pte_chain = page_add_rmap(page, pte, pte_chain);
 	pte_unmap(pte);
-	flush_tlb_page(vma, addr);
+	if (flush)
+		flush_tlb_page(vma, addr);
 
 	spin_unlock(&mm->page_table_lock);
 	pte_chain_free(pte_chain);
@@ -104,26 +106,28 @@ err:
  *
  * this syscall works purely via pagetables, so it's the most efficient
  * way to map the same (large) file into a given virtual window. Unlike
- * mremap()/mmap() it does not create any new vmas.
+ * mmap()/mremap() it does not create any new vmas. The new mappings are
+ * also safe across swapout.
  *
- * The new mappings do not live across swapout, so either use MAP_LOCKED
- * or use PROT_NONE in the original linear mapping and add a special
- * SIGBUS pagefault handler to reinstall zapped mappings.
+ * NOTE: the 'prot' parameter right now is ignored, and the vma's default
+ * protection is used. Arbitrary protections might be implemented in the
+ * future.
  */
 int sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long prot, unsigned long pgoff, unsigned long flags)
+	unsigned long __prot, unsigned long pgoff, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long end = start + size;
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
 
+	if (__prot)
+		return err;
 	/*
 	 * Sanitize the syscall parameters:
 	 */
-	start = PAGE_ALIGN(start);
-	size = PAGE_ALIGN(size);
-	prot &= 0xf;
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
 
 	down_read(&mm->mmap_sem);
 
@@ -136,15 +140,9 @@ int sys_remap_file_pages(unsigned long s
 	if (vma && (vma->vm_flags & VM_SHARED) &&
 		vma->vm_ops && vma->vm_ops->populate &&
 			end > start && start >= vma->vm_start &&
-				end <= vma->vm_end) {
-		/*
-		 * Change the default protection to PROT_NONE:
-		 */
-		if (pgprot_val(vma->vm_page_prot) != pgprot_val(__S000))
-			vma->vm_page_prot = __S000;
-		err = vma->vm_ops->populate(vma, start, size, prot,
-				pgoff, flags & MAP_NONBLOCK);
-	}
+				end <= vma->vm_end)
+		err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
+				pgoff, flags & MAP_NONBLOCK);
 
 	up_read(&mm->mmap_sem);
diff -puN mm/memory.c~remap-file-pages-2.5.63-a1 mm/memory.c
--- 25/mm/memory.c~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/mm/memory.c	Wed Mar 12 14:16:23 2003
@@ -293,7 +293,8 @@ skip_copy_pte_range:
 				goto cont_copy_pte_range_noset;
 			/* pte contains position in swap, so copy. */
 			if (!pte_present(pte)) {
-				swap_duplicate(pte_to_swp_entry(pte));
+				if (!pte_file(pte))
+					swap_duplicate(pte_to_swp_entry(pte));
 				set_pte(dst_pte, pte);
 				goto cont_copy_pte_range_noset;
 			}
@@ -427,7 +428,8 @@ zap_pte_range(struct mmu_gather *tlb, pm
 				}
 			}
 		} else {
-			free_swap_and_cache(pte_to_swp_entry(pte));
+			if (!pte_file(pte))
+				free_swap_and_cache(pte_to_swp_entry(pte));
 			pte_clear(ptep);
 		}
 	}
@@ -1410,6 +1412,41 @@ out:
 }
 
 /*
+ * Fault of a previously existing named mapping. Repopulate the pte
+ * from the encoded file_pte if possible. This enables swappable
+ * nonlinear vmas.
+ */
+static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
+	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+{
+	unsigned long pgoff;
+	int err;
+
+	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
+	/*
+	 * Fall back to the linear mapping if the fs does not support
+	 * ->populate:
+	 */
+	if (!vma->vm_ops || !vma->vm_ops->populate ||
+			(write_access && !(vma->vm_flags & VM_SHARED))) {
+		pte_clear(pte);
+		return do_no_page(mm, vma, address, write_access, pte, pmd);
+	}
+
+	pgoff = pte_to_pgoff(*pte);
+
+	pte_unmap(pte);
+	spin_unlock(&mm->page_table_lock);
+
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+	if (err == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (err)
+		return VM_FAULT_SIGBUS;
+	return VM_FAULT_MAJOR;
+}
+
+/*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
  * RISC architectures). The early dirtying is also good on the i386.
@@ -1445,6 +1482,8 @@ static inline int handle_pte_fault(struc
 	 */
 	if (pte_none(entry))
 		return do_no_page(mm, vma, address, write_access, pte, pmd);
+	if (pte_file(entry))
+		return do_file_page(mm, vma, address, write_access, pte, pmd);
 
 	return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
 }
diff -puN mm/rmap.c~remap-file-pages-2.5.63-a1 mm/rmap.c
--- 25/mm/rmap.c~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/mm/rmap.c	Wed Mar 12 14:16:23 2003
@@ -365,11 +365,26 @@ static int try_to_unmap_one(struct page
 	pte = ptep_get_and_clear(ptep);
 	flush_tlb_page(vma, address);
 
-	/* Store the swap location in the pte. See handle_pte_fault() ... */
 	if (PageSwapCache(page)) {
+		/*
+		 * Store the swap location in the pte.
+		 * See handle_pte_fault() ...
+		 */
 		swp_entry_t entry = { .val = page->index };
 		swap_duplicate(entry);
 		set_pte(ptep, swp_entry_to_pte(entry));
+		BUG_ON(pte_file(*ptep));
+	} else {
+		unsigned long pgidx;
+		/*
+		 * If a nonlinear mapping then store the file page offset
+		 * in the pte.
+		 */
+		pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
+		if (page->index != pgidx) {
+			set_pte(ptep, pgoff_to_pte(page->index));
+			BUG_ON(!pte_file(*ptep));
+		}
 	}
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
diff -puN mm/shmem.c~remap-file-pages-2.5.63-a1 mm/shmem.c
--- 25/mm/shmem.c~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/mm/shmem.c	Wed Mar 12 14:16:23 2003
@@ -945,7 +945,7 @@ struct page *shmem_nopage(struct vm_area
 
 static int shmem_populate(struct vm_area_struct *vma,
 	unsigned long addr, unsigned long len,
-	unsigned long prot, unsigned long pgoff, int nonblock)
+	pgprot_t prot, unsigned long pgoff, int nonblock)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct mm_struct *mm = vma->vm_mm;
diff -puN mm/swapfile.c~remap-file-pages-2.5.63-a1 mm/swapfile.c
--- 25/mm/swapfile.c~remap-file-pages-2.5.63-a1	Wed Mar 12 14:16:23 2003
+++ 25-akpm/mm/swapfile.c	Wed Mar 12 14:16:23 2003
@@ -384,6 +384,8 @@ unuse_pte(struct vm_area_struct *vma, un
 {
 	pte_t pte = *dir;
 
+	if (pte_file(pte))
+		return;
 	if (likely(pte_to_swp_entry(pte).val != entry.val))
 		return;
 	if (unlikely(pte_none(pte) || pte_present(pte)))

_
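As a sanity illustration of the 2-level encoding above (a standalone userspace sketch, not part of the patch - the helpers mirror the pte_to_pgoff()/pgoff_to_pte() macros from pgtable-2level.h):

#include <assert.h>
#include <stdint.h>

#define _PAGE_FILE	0x040

/*
 * Same bit layout as the 2-level macros: bits 0, 6 and 7 of pte_low
 * are reserved, so 5 offset bits go to bits 1..5 and the remaining
 * 24 bits to bits 8..31.
 */
static uint32_t pgoff_to_pte_low(uint32_t off)
{
	return ((off & 0x1f) << 1) + ((off >> 5) << 8) + _PAGE_FILE;
}

static uint32_t pte_low_to_pgoff(uint32_t pte_low)
{
	return ((pte_low >> 1) & 0x1f) + ((pte_low >> 8) << 5);
}

int main(void)
{
	uint32_t off;

	/* 29 usable bits: every offset up to 2^29 - 1 must round-trip. */
	for (off = 0; off < (1UL << 29); off += 4097)
		assert(pte_low_to_pgoff(pgoff_to_pte_low(off)) == off);
	assert(pte_low_to_pgoff(pgoff_to_pte_low((1UL << 29) - 1))
						== (1UL << 29) - 1);
	return 0;
}

The largest encodable offset is 2^29 - 1 pages, which with 4 KB pages gives the 2 TB limit mentioned at the top.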