Earlier on, in 2.6.6, we took the vma merging code out of mremap.c and let
it rely on vma_merge instead (via copy_vma).  Now take the vma merging code
out of mprotect.c and let it rely on vma_merge too: so vma_merge becomes
the sole vma merging engine.  The fruit of this consolidation is that
mprotect now merges file-backed vmas naturally.  Make this change now
because anon_vma will complicate the vma merging rules; let's keep them
all in one place.

vma_merge remains where the decisions are made, whether to merge with prev
and/or next; but now [addr,end) may be the latter part of prev, or the
first part or whole of next, whereas before it was always a new area.

vma_adjust carries out vma_merge's decision; but when sliding the boundary
between vma and next, it must temporarily remove next from the prio_tree
too.  And it turned out (by oops) to have a surer idea than vma_merge of
whether next needs to be removed, so the fput and freeing move into
vma_adjust.

Too much decipherment of what's going on at the start of vma_adjust?  Yes,
and there's a delicate assumption that you may use vma_adjust in sliding a
boundary, or splitting in two, or growing a vma (mremap uses it in that
way), but not for simply shrinking a vma.  Which is so, and must be so
(how could pages mapped in the part to go be zapped without first
splitting?), but would feel better with some protection.

__vma_unlink can then be moved from mm.h to mmap.c, and mm.h's
more-misleading-than-helpful can_vma_merge is deleted.
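An illustrative aside, not part of the patch: the question vma_merge
answers can be modelled in a few lines of userspace C.  The toy_vma
struct and try_merge helper below are invented names, and the toy
compares only flags, where the real can_vma_merge_before/after also
compare file, offset and (via mpol_equal) mempolicy; but it shows the
prev/next decision shape.  Only the simplest geometry is modelled, where
[addr,end) is a new area: the mprotect cases in the comment block below
additionally let [addr,end) overlap prev or next.

#include <stdio.h>

struct toy_vma {
	unsigned long start, end, flags;
};

/*
 * Can the request [addr,end) with these flags merge with prev
 * and/or next?  A deliberately simplified model of vma_merge.
 */
static const char *try_merge(struct toy_vma *prev, struct toy_vma *next,
		unsigned long addr, unsigned long end, unsigned long flags)
{
	int with_prev = prev && prev->end == addr && prev->flags == flags;
	int with_next = next && next->start == end && next->flags == flags;

	if (with_prev && with_next)
		return "merge both: extend prev to next's end, free next";
	if (with_prev)
		return "merge prev: slide prev's end up to end";
	if (with_next)
		return "merge next: slide next's start down to addr";
	return "cannot merge: keep [addr,end) as its own vma";
}

int main(void)
{
	struct toy_vma prev = { 0x1000, 0x3000, 0 };
	struct toy_vma next = { 0x4000, 0x6000, 0 };

	/* [0x3000,0x4000) exactly fills the hole between prev and next */
	printf("%s\n", try_merge(&prev, &next, 0x3000, 0x4000, 0));
	return 0;
}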
---

 25-akpm/include/linux/mm.h |   29 +------
 25-akpm/mm/mmap.c          |  172 +++++++++++++++++++++++++++++++++------------
 25-akpm/mm/mprotect.c      |  110 ++++++----------------------
 3 files changed, 159 insertions(+), 152 deletions(-)

diff -puN include/linux/mm.h~rmap-36-mprotect-use-vma_merge include/linux/mm.h
--- 25/include/linux/mm.h~rmap-36-mprotect-use-vma_merge	Tue May 18 16:25:50 2004
+++ 25-akpm/include/linux/mm.h	Tue May 18 16:25:50 2004
@@ -607,7 +607,12 @@ struct vm_area_struct *vma_prio_tree_nex
 
 /* mmap.c */
 extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
-	unsigned long end, pgoff_t pgoff, struct vm_area_struct *next);
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
+extern struct vm_area_struct *vma_merge(struct mm_struct *,
+	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+	unsigned long vm_flags, struct file *, pgoff_t, struct mempolicy *);
+extern int split_vma(struct mm_struct *,
+	struct vm_area_struct *, unsigned long addr, int new_below);
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
@@ -638,26 +643,6 @@ extern int do_munmap(struct mm_struct *,
 
 extern unsigned long do_brk(unsigned long, unsigned long);
 
-static inline void
-__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
-	struct vm_area_struct *prev)
-{
-	prev->vm_next = vma->vm_next;
-	rb_erase(&vma->vm_rb, &mm->mm_rb);
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
-}
-
-static inline int
-can_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags)
-{
-#ifdef CONFIG_MMU
-	if (!vma->vm_file && vma->vm_flags == vm_flags)
-		return 1;
-#endif
-	return 0;
-}
-
 /* filemap.c */
 extern unsigned long page_unuse(struct page *);
 extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -691,8 +676,6 @@ extern int expand_stack(struct vm_area_s
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
-extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
-	unsigned long addr, int new_below);
 
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none.  Assume start_addr < end_addr. */
diff -puN mm/mmap.c~rmap-36-mprotect-use-vma_merge mm/mmap.c
--- 25/mm/mmap.c~rmap-36-mprotect-use-vma_merge	Tue May 18 16:25:50 2004
+++ 25-akpm/mm/mmap.c	Tue May 18 16:25:50 2004
@@ -339,20 +339,51 @@ __insert_vm_struct(struct mm_struct * mm
 	validate_mm(mm);
 }
 
+static inline void
+__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev)
+{
+	prev->vm_next = vma->vm_next;
+	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	if (mm->mmap_cache == vma)
+		mm->mmap_cache = prev;
+}
+
 /*
  * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  * is already present in an i_mmap tree without adjusting the tree.
  * The following helper function should be used when such adjustments
- * are necessary. The "next" vma (if any) is to be removed or inserted
+ * are necessary. The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
 void vma_adjust(struct vm_area_struct *vma, unsigned long start,
-	unsigned long end, pgoff_t pgoff, struct vm_area_struct *next)
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
 {
 	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *next = vma->vm_next;
 	struct address_space *mapping = NULL;
 	struct prio_tree_root *root = NULL;
 	struct file *file = vma->vm_file;
+	long adjust_next = 0;
+	int remove_next = 0;
+
+	if (next && !insert) {
+		if (end >= next->vm_end) {
+again:			remove_next = 1 + (end > next->vm_end);
+			end = next->vm_end;
+		} else if (end < vma->vm_end || end > next->vm_start) {
+			/*
+			 * vma shrinks, and !insert tells it's not
+			 * split_vma inserting another: so it must
+			 * be mprotect shifting the boundary down.
+			 *	Or:
+			 * vma expands, overlapping part of the next:
+			 * must be mprotect shifting the boundary up.
+			 */
+			BUG_ON(vma->vm_end != next->vm_start);
+			adjust_next = end - next->vm_start;
+		}
+	}
 
 	if (file) {
 		mapping = file->f_mapping;
@@ -365,38 +396,67 @@ void vma_adjust(struct vm_area_struct *v
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
 		vma_prio_tree_remove(vma, root);
+		if (adjust_next)
+			vma_prio_tree_remove(next, root);
 	}
+
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_pgoff = pgoff;
+	if (adjust_next) {
+		next->vm_start += adjust_next;
+		next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+	}
+
 	if (root) {
+		if (adjust_next) {
+			vma_prio_tree_init(next);
+			vma_prio_tree_insert(next, root);
+		}
 		vma_prio_tree_init(vma);
 		vma_prio_tree_insert(vma, root);
 		flush_dcache_mmap_unlock(mapping);
 	}
 
-	if (next) {
-		if (next == vma->vm_next) {
-			/*
-			 * vma_merge has merged next into vma, and needs
-			 * us to remove next before dropping the locks.
-			 */
-			__vma_unlink(mm, next, vma);
-			if (file)
-				__remove_shared_vm_struct(next, file, mapping);
-		} else {
-			/*
-			 * split_vma has split next from vma, and needs
-			 * us to insert next before dropping the locks
-			 * (next may either follow vma or precede it).
-			 */
-			__insert_vm_struct(mm, next);
-		}
+	if (remove_next) {
+		/*
+		 * vma_merge has merged next into vma, and needs
+		 * us to remove next before dropping the locks.
+		 */
+		__vma_unlink(mm, next, vma);
+		if (file)
+			__remove_shared_vm_struct(next, file, mapping);
+	} else if (insert) {
+		/*
+		 * split_vma has split insert from vma, and needs
+		 * us to insert it before dropping the locks
+		 * (it may either follow vma or precede it).
+		 */
+		__insert_vm_struct(mm, insert);
 	}
 
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
+
+	if (remove_next) {
+		if (file)
+			fput(file);
+		mm->map_count--;
+		mpol_free(vma_policy(next));
+		kmem_cache_free(vm_area_cachep, next);
+		/*
+		 * In mprotect's case 6 (see comments on vma_merge),
+		 * we must remove another next too. It would clutter
+		 * up the code too much to do both in one go.
+		 */
+		if (remove_next == 2) {
+			next = vma->vm_next;
+			goto again;
+		}
+	}
+
+	validate_mm(mm);
 }
 
 /*
@@ -460,18 +520,42 @@ can_vma_merge_after(struct vm_area_struc
 }
 
 /*
- * Given a new mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.  Or
- * both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
+ * whether that can be merged with its predecessor or its successor.
+ * Or both (it neatly fills a hole).
+ *
+ * In most cases - when called for mmap, brk or mremap - [addr,end) is
+ * certain not to be mapped by the time vma_merge is called; but when
+ * called for mprotect, it is certain to be already mapped (either at
+ * an offset within prev, or at the start of next), and the flags of
+ * this area are about to be changed to vm_flags - and the no-change
+ * case has already been eliminated.
+ *
+ * The following mprotect cases have to be considered, where AAAA is
+ * the area passed down from mprotect_fixup, never extending beyond one
+ * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ *
+ *     AAAA             AAAA             AAAA             AAAA
+ *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
+ *    cannot merge    might become    might become    might become
+ *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
+ *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
+ *    mremap move:                                    PPPPNNNNNNNN 8
+ *        AAAA
+ *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
+ *    might become    case 1 below    case 2 below    case 3 below
+ *
+ * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
+ * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  */
-static struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	struct vm_area_struct *prev, unsigned long addr,
 	unsigned long end, unsigned long vm_flags,
 	struct file *file, pgoff_t pgoff,
 	struct mempolicy *policy)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
-	struct vm_area_struct *next;
+	struct vm_area_struct *area, *next;
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags,
@@ -480,16 +564,18 @@ static struct vm_area_struct *vma_merge(
 	if (vm_flags & VM_SPECIAL)
 		return NULL;
 
-	if (!prev) {
+	if (prev)
+		next = prev->vm_next;
+	else
 		next = mm->mmap;
-		goto merge_next;
-	}
-	next = prev->vm_next;
+	area = next;
+	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
+		next = next->vm_next;
 
 	/*
 	 * Can it merge with the predecessor?
 	 */
-	if (prev->vm_end == addr &&
+	if (prev && prev->vm_end == addr &&
 			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		/*
@@ -499,33 +585,29 @@ static struct vm_area_struct *vma_merge(
 			mpol_equal(policy, vma_policy(next)) &&
 			can_vma_merge_before(next, vm_flags, file,
 						pgoff+pglen)) {
+							/* cases 1, 6 */
 			vma_adjust(prev, prev->vm_start,
-				next->vm_end, prev->vm_pgoff, next);
-			if (file)
-				fput(file);
-			mm->map_count--;
-			mpol_free(vma_policy(next));
-			kmem_cache_free(vm_area_cachep, next);
-		} else
+				next->vm_end, prev->vm_pgoff, NULL);
+		} else					/* cases 2, 5, 7 */
 			vma_adjust(prev, prev->vm_start,
 				end, prev->vm_pgoff, NULL);
 		return prev;
 	}
 
-merge_next:
-
 	/*
 	 * Can this new request be merged in front of next?
 	 */
-	if (next) {
-		if (end == next->vm_start &&
-			mpol_equal(policy, vma_policy(next)) &&
-			can_vma_merge_before(next, vm_flags, file,
+	if (next && end == next->vm_start &&
+			mpol_equal(policy, vma_policy(next)) &&
+			can_vma_merge_before(next, vm_flags, file,
 					pgoff+pglen)) {
-			vma_adjust(next, addr, next->vm_end,
+		if (prev && addr < prev->vm_end)	/* case 4 */
+			vma_adjust(prev, prev->vm_start,
+				addr, prev->vm_pgoff, NULL);
+		else					/* cases 3, 8 */
+			vma_adjust(area, addr, next->vm_end,
 				next->vm_pgoff - pglen, NULL);
-			return next;
-		}
+		return area;
 	}
 
 	return NULL;
diff -puN mm/mprotect.c~rmap-36-mprotect-use-vma_merge mm/mprotect.c
--- 25/mm/mprotect.c~rmap-36-mprotect-use-vma_merge	Tue May 18 16:25:50 2004
+++ 25-akpm/mm/mprotect.c	Tue May 18 16:25:50 2004
@@ -107,53 +107,6 @@ change_protection(struct vm_area_struct
 	spin_unlock(&current->mm->page_table_lock);
 	return;
 }
-/*
- * Try to merge a vma with the previous flag, return 1 if successful or 0 if it
- * was impossible.
- */
-static int
-mprotect_attempt_merge(struct vm_area_struct *vma, struct vm_area_struct *prev,
-		unsigned long end, int newflags)
-{
-	struct mm_struct * mm;
-
-	if (!prev || !vma)
-		return 0;
-	mm = vma->vm_mm;
-	if (prev->vm_end != vma->vm_start)
-		return 0;
-	if (!can_vma_merge(prev, newflags))
-		return 0;
-	if (vma->vm_file || (vma->vm_flags & VM_SHARED))
-		return 0;
-	if (!vma_mpol_equal(vma, prev))
-		return 0;
-
-	/*
-	 * If the whole area changes to the protection of the previous one
-	 * we can just get rid of it.
-	 */
-	if (end == vma->vm_end) {
-		spin_lock(&mm->page_table_lock);
-		prev->vm_end = end;
-		__vma_unlink(mm, vma, prev);
-		spin_unlock(&mm->page_table_lock);
-
-		mpol_free(vma_policy(vma));
-		kmem_cache_free(vm_area_cachep, vma);
-		mm->map_count--;
-		return 1;
-	}
-
-	/*
-	 * Otherwise extend it.
-	 */
-	spin_lock(&mm->page_table_lock);
-	prev->vm_end = end;
-	vma->vm_start = end;
-	spin_unlock(&mm->page_table_lock);
-	return 1;
-}
 
 static int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -162,6 +115,7 @@ mprotect_fixup(struct vm_area_struct *vm
 	struct mm_struct * mm = vma->vm_mm;
 	unsigned long charged = 0;
 	pgprot_t newprot;
+	pgoff_t pgoff;
 	int error;
 
 	if (newflags == vma->vm_flags) {
@@ -188,15 +142,18 @@ mprotect_fixup(struct vm_area_struct *vm
 
 	newprot = protection_map[newflags & 0xf];
 
-	if (start == vma->vm_start) {
-		/*
-		 * Try to merge with the previous vma.
-		 */
-		if (mprotect_attempt_merge(vma, *pprev, end, newflags)) {
-			vma = *pprev;
-			goto success;
-		}
-	} else {
+	/*
+	 * First try to merge with previous and/or next vma.
+	 */
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*pprev = vma_merge(mm, *pprev, start, end, newflags,
+			vma->vm_file, pgoff, vma_policy(vma));
+	if (*pprev) {
+		vma = *pprev;
+		goto success;
+	}
+
+	if (start != vma->vm_start) {
 		error = split_vma(mm, vma, start, 1);
 		if (error)
 			goto fail;
@@ -213,13 +170,13 @@ mprotect_fixup(struct vm_area_struct *vm
 			goto fail;
 	}
 
+success:
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
-success:
 	change_protection(vma, start, end, newprot);
 	return 0;
 
@@ -232,7 +189,7 @@ asmlinkage long
 sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 {
 	unsigned long vm_flags, nstart, end, tmp;
-	struct vm_area_struct * vma, * next, * prev;
+	struct vm_area_struct *vma, *prev;
 	int error = -EINVAL;
 	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
 	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
@@ -276,10 +233,11 @@ sys_mprotect(unsigned long start, size_t
 			goto out;
 		}
 	}
+	if (start > vma->vm_start)
+		prev = vma;
 
 	for (nstart = start ; ; ) {
 		unsigned int newflags;
-		int last = 0;
 
 		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
 
@@ -299,41 +257,25 @@ sys_mprotect(unsigned long start, size_t
 		if (error)
 			goto out;
 
-		if (vma->vm_end > end) {
-			error = mprotect_fixup(vma, &prev, nstart, end, newflags);
-			goto out;
-		}
-		if (vma->vm_end == end)
-			last = 1;
-
 		tmp = vma->vm_end;
-		next = vma->vm_next;
+		if (tmp > end)
+			tmp = end;
 		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			goto out;
-		if (last)
-			break;
 		nstart = tmp;
-		vma = next;
+
+		if (nstart < prev->vm_end)
+			nstart = prev->vm_end;
+		if (nstart >= end)
+			goto out;
+
+		vma = prev->vm_next;
 		if (!vma || vma->vm_start != nstart) {
 			error = -ENOMEM;
 			goto out;
 		}
 	}
-
-	if (next && prev->vm_end == next->vm_start &&
-			can_vma_merge(next, prev->vm_flags) &&
-			vma_mpol_equal(prev, next) &&
-			!prev->vm_file && !(prev->vm_flags & VM_SHARED)) {
-		spin_lock(&prev->vm_mm->page_table_lock);
-		prev->vm_end = next->vm_end;
-		__vma_unlink(prev->vm_mm, next, prev);
-		spin_unlock(&prev->vm_mm->page_table_lock);
-
-		mpol_free(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
-		prev->vm_mm->map_count--;
-	}
 out:
 	up_write(&current->mm->mmap_sem);
 	return error;
_
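
A second illustrative aside, again not from the patch itself: the
adjust_next path in vma_adjust above slides the boundary between vma and
next for mprotect cases 4 and 5, and must move next->vm_pgoff together
with next->vm_start so the file offset of every page still mapped is
unchanged (which is why next has to come out of the prio_tree and go
back in).  A userspace sketch of that arithmetic, with invented names
and TOY_PAGE_SHIFT standing in for PAGE_SHIFT:

#include <assert.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 12	/* 4K pages, as on i386 */

struct toy_vma {
	unsigned long vm_start, vm_end;
	unsigned long vm_pgoff;		/* file page mapped at vm_start */
};

/* Grow vma up to end, shrinking next to match: mprotect case 5. */
static void slide_boundary(struct toy_vma *vma, struct toy_vma *next,
		unsigned long end)
{
	long adjust_next = end - next->vm_start;

	/* vma and next must abut, as vma_adjust's BUG_ON checks */
	assert(vma->vm_end == next->vm_start);
	assert(end > next->vm_start && end < next->vm_end);

	vma->vm_end = end;
	next->vm_start += adjust_next;
	next->vm_pgoff += adjust_next >> TOY_PAGE_SHIFT;
}

int main(void)
{
	struct toy_vma vma  = { 0x1000, 0x3000, 0 };	/* file pages 0..1 */
	struct toy_vma next = { 0x3000, 0x6000, 2 };	/* file pages 2..4 */

	slide_boundary(&vma, &next, 0x4000);

	/* next now begins one page later, and its pgoff moved with it: 3 */
	printf("next: [%#lx,%#lx) pgoff %lu\n",
		next.vm_start, next.vm_end, next.vm_pgoff);
	return 0;
}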