From: Ingo Molnar - enable the 'prot' parameter for shared-writable mappings (the ones which are the primary target for fremap), without breaking up the vma. Costs: max # of swapfiles is 16 instead of 32 (could be fixed on PAE if we want). - added MAP_INHERIT: this both cleanly implements the old sys_remap_file_pages() semantics, and might be useful as well. it adds a new syscall because the old sys_remap_file_pages() syscall was messed up in an unfixable way :-( [prot == 0 means PROT_NONE and now we need this.] But the patch is ABI-compatible, it keeps the old syscall and wraps it cleanly internally. patch applies, compiles & boots cleanly on SMP x86. Non-x86 architectures won't compile; they need to do this: - add MAP_INHERIT - add pte_to_pgprot and change pte_to_pgoff if the bit comes off the offset bits. - change pgoff_to_pte to pgoff_prot_to_pte. - chop a bit off __swp_type() if there's no more space. it should be quite straightforward for them, but needs to be tested. 25-akpm/arch/i386/kernel/entry.S | 3 25-akpm/include/asm-i386/mman.h | 1 25-akpm/include/asm-i386/pgtable-2level.h | 15 ++-- 25-akpm/include/asm-i386/pgtable-3level.h | 11 ++- 25-akpm/include/linux/mm.h | 2 25-akpm/mm/fremap.c | 105 +++++++++++++++++++----------- 25-akpm/mm/memory.c | 4 - 25-akpm/mm/mmap.c | 8 +- 25-akpm/mm/rmap.c | 7 +- 9 files changed, 105 insertions(+), 51 deletions(-) diff -puN arch/i386/kernel/entry.S~remap_file_pages-prot-2.6.1-H2 arch/i386/kernel/entry.S --- 25/arch/i386/kernel/entry.S~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/arch/i386/kernel/entry.S Mon Jan 5 14:49:48 2004 @@ -891,7 +891,7 @@ ENTRY(sys_call_table) .long sys_epoll_create .long sys_epoll_ctl /* 255 */ .long sys_epoll_wait - .long sys_remap_file_pages + .long old_remap_file_pages .long sys_set_tid_address .long sys_timer_create .long sys_timer_settime /* 260 */ @@ -908,5 +908,6 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ + 
.long sys_remap_file_pages syscall_table_size=(.-sys_call_table) diff -puN include/asm-i386/mman.h~remap_file_pages-prot-2.6.1-H2 include/asm-i386/mman.h --- 25/include/asm-i386/mman.h~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/include/asm-i386/mman.h Mon Jan 5 14:49:48 2004 @@ -22,6 +22,7 @@ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_INHERIT 0x20000 /* inherit the protection bits of the underlying vma*/ #define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_INVALIDATE 2 /* invalidate the caches */ diff -puN include/asm-i386/pgtable-2level.h~remap_file_pages-prot-2.6.1-H2 include/asm-i386/pgtable-2level.h --- 25/include/asm-i386/pgtable-2level.h~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/include/asm-i386/pgtable-2level.h Mon Jan 5 14:49:48 2004 @@ -64,15 +64,20 @@ static inline pmd_t * pmd_offset(pgd_t * #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) /* - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset + * Bits 0, 1, 6 and 7 are taken, split up the 28 bits of offset * into this range: */ -#define PTE_FILE_MAX_BITS 29 +#define PTE_FILE_MAX_BITS 28 #define pte_to_pgoff(pte) \ - ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 )) + ((((pte).pte_low >> 2) & 0xf ) + (((pte).pte_low >> 8) << 4 )) +#define pte_to_pgprot(pte) \ + __pgprot(((pte).pte_low & (_PAGE_RW | _PAGE_PROTNONE)) \ + | (((pte).pte_low & _PAGE_PROTNONE) ? 
0 : \ + (_PAGE_USER | _PAGE_PRESENT)) | _PAGE_ACCESSED) -#define pgoff_to_pte(off) \ - ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE }) +#define pgoff_prot_to_pte(off, prot) \ + ((pte_t) { (((off) & 0xf) << 2) + (((off) >> 4) << 8) + \ + (pgprot_val(prot) & (_PAGE_RW | _PAGE_PROTNONE)) + _PAGE_FILE }) #endif /* _I386_PGTABLE_2LEVEL_H */ diff -puN include/asm-i386/pgtable-3level.h~remap_file_pages-prot-2.6.1-H2 include/asm-i386/pgtable-3level.h --- 25/include/asm-i386/pgtable-3level.h~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/include/asm-i386/pgtable-3level.h Mon Jan 5 14:49:48 2004 @@ -120,7 +120,16 @@ static inline pmd_t pfn_pmd(unsigned lon * put the 32 bits of offset into the high part. */ #define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) }) + +#define pte_to_pgprot(pte) \ + __pgprot(((pte).pte_low & (_PAGE_RW | _PAGE_PROTNONE)) \ + | (((pte).pte_low & _PAGE_PROTNONE) ? 0 : \ + (_PAGE_USER | _PAGE_PRESENT)) | _PAGE_ACCESSED) + +#define pgoff_prot_to_pte(off, prot) \ + ((pte_t) { _PAGE_FILE + \ + (pgprot_val(prot) & (_PAGE_RW | _PAGE_PROTNONE)) , (off) }) + #define PTE_FILE_MAX_BITS 32 #endif /* _I386_PGTABLE_3LEVEL_H */ diff -puN include/linux/mm.h~remap_file_pages-prot-2.6.1-H2 include/linux/mm.h --- 25/include/linux/mm.h~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/include/linux/mm.h Mon Jan 5 14:49:48 2004 @@ -445,7 +445,7 @@ extern int install_file_pte(struct mm_st extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); +extern long __remap_file_pages(struct mm_struct 
*mm, unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); void put_dirty_page(struct task_struct *tsk, struct page *page, unsigned long address, pgprot_t prot); diff -puN mm/fremap.c~remap_file_pages-prot-2.6.1-H2 mm/fremap.c --- 25/mm/fremap.c~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/mm/fremap.c Mon Jan 5 14:49:48 2004 @@ -152,7 +152,7 @@ int install_file_pte(struct mm_struct *m flush = zap_pte(mm, vma, addr, pte); - set_pte(pte, pgoff_to_pte(pgoff)); + set_pte(pte, pgoff_prot_to_pte(pgoff, pgprot)); pte_val = *pte; pte_unmap(pte); if (flush) @@ -174,27 +174,22 @@ err_unlock: * @size: size of the remapped virtual memory range * @prot: new protection bits of the range * @pgoff: to be mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. + * @flags: bits MAP_INHERIT or MAP_NONBLOCK - the latter will cause no IO. * * this syscall works purely via pagetables, so it's the most efficient * way to map the same (large) file into a given virtual window. Unlike * mmap()/mremap() it does not create any new vmas. The new mappings are * also safe across swapout. - * - * NOTE: the 'prot' parameter right now is ignored, and the vma's default - * protection is used. Arbitrary protections might be implemented in the - * future. 
*/ -long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long __prot, unsigned long pgoff, unsigned long flags) +long __remap_file_pages(struct mm_struct *mm, unsigned long start, + unsigned long size, unsigned long prot, + unsigned long pgoff, unsigned long flags) { - struct mm_struct *mm = current->mm; + pgprot_t pgprot = protection_map[calc_vm_prot_bits(prot) | VM_SHARED]; unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; - if (__prot) - return err; /* * Sanitize the syscall parameters: */ @@ -214,37 +209,71 @@ long sys_remap_file_pages(unsigned long /* We need down_write() to change vma->vm_flags. */ down_write(&mm->mmap_sem); vma = find_vma(mm, start); - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma: + * Make sure the permissions are right, the vma is shared + * (or linearly remapped - ie. prefaulted), that it supports + * prefaulting, and that the remapped range is valid and fully + * within the single existing vma: */ - if (vma && (vma->vm_flags & VM_SHARED) && - vma->vm_ops && vma->vm_ops->populate && - end > start && start >= vma->vm_start && - end <= vma->vm_end) { - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) - vma->vm_flags |= VM_NONLINEAR; - - /* ->populate can take a long time, so downgrade the lock. */ - downgrade_write(&mm->mmap_sem); - err = vma->vm_ops->populate(vma, start, size, - vma->vm_page_prot, - pgoff, flags & MAP_NONBLOCK); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). 
- */ - up_read(&mm->mmap_sem); - } else { - up_write(&mm->mmap_sem); + if (!vma) + goto out_unlock; + if (unlikely(flags & MAP_INHERIT)) + pgprot = vma->vm_page_prot; + else { + err = -EPERM; + if (((prot & PROT_READ) && !(vma->vm_flags & VM_MAYREAD))) + goto out_unlock; + if (((prot & PROT_WRITE) && !(vma->vm_flags & VM_MAYWRITE))) + goto out_unlock; + if (((prot & PROT_EXEC) && !(vma->vm_flags & VM_MAYEXEC))) + goto out_unlock; } + if (!vma->vm_ops || !vma->vm_ops->populate || end <= start || + start < vma->vm_start || end > vma->vm_end) + goto out_unlock; + + if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) { + if (!(vma->vm_flags & VM_SHARED)) + goto out_unlock; + vma->vm_flags |= VM_NONLINEAR; + } + + /* + * ->populate can take a long time, so downgrade the lock: + */ + downgrade_write(&mm->mmap_sem); + err = vma->vm_ops->populate(vma, start, size, + pgprot, pgoff, flags & MAP_NONBLOCK); + + /* + * We can't clear VM_NONLINEAR because we'd have to do + * it after ->populate completes, and that would prevent + * downgrading the lock. (Locks can't be upgraded). + */ + up_read(&mm->mmap_sem); return err; + +out_unlock: + up_write(&mm->mmap_sem); + return err; +} + +long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, unsigned long flags) +{ + return __remap_file_pages(current->mm, start, size, prot, pgoff, flags); } +/* + * sys_remap_file_pages - the old API. Implies MAP_INHERIT. 
+ */ +long old_remap_file_pages(unsigned long start, unsigned long size, + unsigned long __prot, unsigned long pgoff, unsigned long flags) +{ + if (__prot) + return -EINVAL; + + return __remap_file_pages(current->mm, start, size, PROT_NONE, + pgoff, flags | MAP_INHERIT); +} diff -puN mm/memory.c~remap_file_pages-prot-2.6.1-H2 mm/memory.c --- 25/mm/memory.c~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/mm/memory.c Mon Jan 5 14:49:48 2004 @@ -1522,6 +1522,7 @@ static int do_file_page(struct mm_struct unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) { unsigned long pgoff; + pgprot_t pgprot; int err; BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); @@ -1536,11 +1537,12 @@ static int do_file_page(struct mm_struct } pgoff = pte_to_pgoff(*pte); + pgprot = pte_to_pgprot(*pte); pte_unmap(pte); spin_unlock(&mm->page_table_lock); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, pgprot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) diff -puN mm/mmap.c~remap_file_pages-prot-2.6.1-H2 mm/mmap.c --- 25/mm/mmap.c~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/mm/mmap.c Mon Jan 5 14:49:48 2004 @@ -690,8 +690,12 @@ out: } if (flags & MAP_POPULATE) { up_write(&mm->mmap_sem); - sys_remap_file_pages(addr, len, 0, - pgoff, flags & MAP_NONBLOCK); + /* + * remap_file_pages() works even if the mapping is private, + * in the linearly-mapped case: + */ + __remap_file_pages(mm, addr, len, PROT_NONE, pgoff, + MAP_INHERIT | (flags & MAP_NONBLOCK)); down_write(&mm->mmap_sem); } return addr; diff -puN mm/rmap.c~remap_file_pages-prot-2.6.1-H2 mm/rmap.c --- 25/mm/rmap.c~remap_file_pages-prot-2.6.1-H2 Mon Jan 5 14:49:48 2004 +++ 25-akpm/mm/rmap.c Mon Jan 5 14:49:48 2004 @@ -343,6 +343,7 @@ static int try_to_unmap_one(struct page BUG_ON(pte_file(*ptep)); } else { unsigned long pgidx; + pgprot_t pgprot = 
pte_to_pgprot(pte); /* * If a nonlinear mapping then store the file page offset * in the pte. @@ -350,8 +351,10 @@ static int try_to_unmap_one(struct page pgidx = (address - vma->vm_start) >> PAGE_SHIFT; pgidx += vma->vm_pgoff; pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); + if (page->index != pgidx || + pgprot_val(pgprot) != pgprot_val(vma->vm_page_prot)) { + + set_pte(ptep, pgoff_prot_to_pte(page->index, pgprot)); BUG_ON(!pte_file(*ptep)); } } _