From: Hugh Dickins

Partial rewrite of mremap's move_vma.  Rajesh Venkatasubramanian has pointed
out that vmtruncate could miss ptes, leaving orphaned pages, because move_vma
only made the new vma visible after filling it.  We see no good reason for
that, and it's time to make move_vma more robust.

Removed all of move_vma's vma merging decisions, leaving them to mmap.c's
vma_merge, with copy_vma added.  Removed the duplicated is_mergeable_vma test
from vma_merge, and the duplicated validate_mm from insert_vm_struct.

move_vma now moves from old to new and then unmaps old; but on error it moves
back from new to old and unmaps new.  Don't unwind within move_page_tables;
let move_vma call it explicitly to unwind, with the right source vma.  Get the
VM_ACCOUNTing right even when the final do_munmap fails.
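For reference, a minimal user-space sketch that exercises this path: growing an
anonymous mapping that is blocked by a guard page forces mremap() to relocate
it, which goes through move_vma and move_page_tables (and, with this patch,
copy_vma).  The 4k page size and the guard-page trick used to force the move
are assumptions of the sketch, not kernel requirements.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define PAGE_SZ		4096UL			/* assumes 4k pages */
#define OLD_SIZE	(16 * PAGE_SZ)
#define NEW_SIZE	(32 * PAGE_SZ)

int main(void)
{
	char *old, *new;

	old = mmap(NULL, OLD_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	memset(old, 0xaa, OLD_SIZE);

	/* Guard page directly after the region, so the mapping cannot
	 * grow in place and mremap has to move it. */
	if (mmap(old + OLD_SIZE, PAGE_SZ, PROT_NONE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED) {
		perror("mmap guard");
		exit(1);
	}

	new = mremap(old, OLD_SIZE, NEW_SIZE, MREMAP_MAYMOVE);
	if (new == MAP_FAILED) {
		perror("mremap");
		exit(1);
	}
	if (new[0] != (char)0xaa || new[OLD_SIZE - 1] != (char)0xaa) {
		fprintf(stderr, "contents lost across move\n");
		exit(1);
	}
	printf("moved from %p to %p\n", (void *)old, (void *)new);
	return 0;
}

The point of the rewrite is that the relocated vma is linked into the mm by
copy_vma before its ptes are filled, so vmtruncate can always find them;
previously the new vma only became visible after the move.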
---

 25-akpm/include/linux/mm.h |    2 
 25-akpm/mm/mmap.c          |   49 ++++++++++--
 25-akpm/mm/mremap.c        |  176 ++++++++++++++-------------------------------
 3 files changed, 99 insertions(+), 128 deletions(-)

diff -puN include/linux/mm.h~mremap-move_vma-fix include/linux/mm.h
--- 25/include/linux/mm.h~mremap-move_vma-fix	2004-04-03 03:00:18.084482296 -0800
+++ 25-akpm/include/linux/mm.h	2004-04-03 03:00:18.090481384 -0800
@@ -541,6 +541,8 @@ extern void si_meminfo_node(struct sysin
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
+	unsigned long addr, unsigned long len, unsigned long pgoff);
 extern void exit_mmap(struct mm_struct *);
 extern unsigned long get_unmapped_area(struct file *, unsigned long,
 	unsigned long, unsigned long, unsigned long);
diff -puN mm/mmap.c~mremap-move_vma-fix mm/mmap.c
--- 25/mm/mmap.c~mremap-move_vma-fix	2004-04-03 03:00:18.085482144 -0800
+++ 25-akpm/mm/mmap.c	2004-04-03 03:00:18.091481232 -0800
@@ -385,7 +385,8 @@ can_vma_merge_after(struct vm_area_struc
  * whether that can be merged with its predecessor or its successor. Or
  * both (it neatly fills a hole).
  */
-static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
+static struct vm_area_struct *vma_merge(struct mm_struct *mm,
+			struct vm_area_struct *prev,
 			struct rb_node *rb_parent, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 			struct file *file, unsigned long pgoff)
@@ -399,7 +400,7 @@ static int vma_merge(struct mm_struct *m
 	 * vma->vm_flags & VM_SPECIAL, too.
 	 */
 	if (vm_flags & VM_SPECIAL)
-		return 0;
+		return NULL;

 	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;

@@ -412,7 +413,6 @@ static int vma_merge(struct mm_struct *m
 	 * Can it merge with the predecessor?
 	 */
 	if (prev->vm_end == addr &&
-			is_mergeable_vma(prev, file, vm_flags) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		struct vm_area_struct *next;
 		int need_up = 0;
@@ -443,12 +443,12 @@ static int vma_merge(struct mm_struct *m

 			mm->map_count--;
 			kmem_cache_free(vm_area_cachep, next);
-			return 1;
+			return prev;
 		}
 		spin_unlock(lock);
 		if (need_up)
 			up(i_shared_sem);
-		return 1;
+		return prev;
 	}

 	/*
@@ -459,7 +459,7 @@ static int vma_merge(struct mm_struct *m
 merge_next:
 	if (!can_vma_merge_before(prev, vm_flags, file,
 			pgoff, (end - addr) >> PAGE_SHIFT))
-		return 0;
+		return NULL;
 	if (end == prev->vm_start) {
 		if (file)
 			down(i_shared_sem);
@@ -469,11 +469,11 @@ static int vma_merge(struct mm_struct *m
 			spin_unlock(lock);
 			if (file)
 				up(i_shared_sem);
-			return 1;
+			return prev;
 		}
 	}

-	return 0;
+	return NULL;
 }

 /*
@@ -1492,5 +1492,36 @@ void insert_vm_struct(struct mm_struct *
 	if (__vma && __vma->vm_start < vma->vm_end)
 		BUG();
 	vma_link(mm, vma, prev, rb_link, rb_parent);
-	validate_mm(mm);
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
+	unsigned long addr, unsigned long len, unsigned long pgoff)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
+
+	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
+			vma->vm_flags, vma->vm_file, pgoff);
+	if (!new_vma) {
+		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		if (new_vma) {
+			*new_vma = *vma;
+			INIT_LIST_HEAD(&new_vma->shared);
+			new_vma->vm_start = addr;
+			new_vma->vm_end = addr + len;
+			new_vma->vm_pgoff = pgoff;
+			if (new_vma->vm_file)
+				get_file(new_vma->vm_file);
+			if (new_vma->vm_ops && new_vma->vm_ops->open)
+				new_vma->vm_ops->open(new_vma);
+			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+		}
+	}
+	return new_vma;
 }
diff -puN mm/mremap.c~mremap-move_vma-fix mm/mremap.c
--- 25/mm/mremap.c~mremap-move_vma-fix	2004-04-03 03:00:18.087481840 -0800
+++ 25-akpm/mm/mremap.c	2004-04-03 03:00:18.093480928 -0800
@@ -148,7 +148,7 @@ out:
 static int move_page_tables(struct vm_area_struct *vma,
 	unsigned long new_addr, unsigned long old_addr, unsigned long len)
 {
-	unsigned long offset = len;
+	unsigned long offset;

 	flush_cache_range(vma, old_addr, old_addr + len);

@@ -157,137 +157,75 @@ static int move_page_tables(struct vm_ar
 	 * easy way out on the assumption that most remappings will be
 	 * only a few pages.. This also makes error recovery easier.
 	 */
-	while (offset) {
-		offset -= PAGE_SIZE;
-		if (move_one_page(vma, old_addr + offset, new_addr + offset))
-			goto oops_we_failed;
+	for (offset = 0; offset < len; offset += PAGE_SIZE) {
+		if (move_one_page(vma, old_addr+offset, new_addr+offset) < 0)
+			break;
 	}
-	return 0;
-
-	/*
-	 * Ok, the move failed because we didn't have enough pages for
-	 * the new page table tree. This is unlikely, but we have to
-	 * take the possibility into account. In that case we just move
-	 * all the pages back (this will work, because we still have
-	 * the old page tables)
-	 */
-oops_we_failed:
-	flush_cache_range(vma, new_addr, new_addr + len);
-	while ((offset += PAGE_SIZE) < len)
-		move_one_page(vma, new_addr + offset, old_addr + offset);
-	zap_page_range(vma, new_addr, len);
-	return -1;
+	return offset;
 }

 static unsigned long move_vma(struct vm_area_struct *vma,
-	unsigned long addr, unsigned long old_len, unsigned long new_len,
-	unsigned long new_addr)
+	unsigned long old_addr, unsigned long old_len,
+	unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *new_vma, *next, *prev;
-	int allocated_vma;
+	struct vm_area_struct *new_vma;
+	unsigned long vm_flags = vma->vm_flags;
+	unsigned long new_pgoff;
+	unsigned long moved_len;
+	unsigned long excess = 0;
 	int split = 0;

-	new_vma = NULL;
-	next = find_vma_prev(mm, new_addr, &prev);
-	if (next) {
-		if (prev && prev->vm_end == new_addr &&
-				can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-				!(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-			if (next != prev->vm_next)
-				BUG();
-			if (prev->vm_end == next->vm_start &&
-					can_vma_merge(next, prev->vm_flags)) {
-				spin_lock(&mm->page_table_lock);
-				prev->vm_end = next->vm_end;
-				__vma_unlink(mm, next, prev);
-				spin_unlock(&mm->page_table_lock);
-				if (vma == next)
-					vma = prev;
-				mm->map_count--;
-				kmem_cache_free(vm_area_cachep, next);
-			}
-		} else if (next->vm_start == new_addr + new_len &&
-				can_vma_merge(next, vma->vm_flags) &&
-				!vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			next->vm_start = new_addr;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = next;
-		}
-	} else {
-		prev = find_vma(mm, new_addr-1);
-		if (prev && prev->vm_end == new_addr &&
-				can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-				!(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-		}
+	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+	new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);
+	if (!new_vma)
+		return -ENOMEM;
+
+	moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
+	if (moved_len < old_len) {
+		/*
+		 * On error, move entries back from new area to old,
+		 * which will succeed since page tables still there,
+		 * and then proceed to unmap new area instead of old.
+		 */
+		move_page_tables(new_vma, old_addr, new_addr, moved_len);
+		vma = new_vma;
+		old_len = new_len;
+		old_addr = new_addr;
+		new_addr = -ENOMEM;
 	}

-	allocated_vma = 0;
-	if (!new_vma) {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-		if (!new_vma)
-			goto out;
-		allocated_vma = 1;
-	}
-
-	if (!move_page_tables(vma, new_addr, addr, old_len)) {
-		unsigned long vm_locked = vma->vm_flags & VM_LOCKED;
-
-		if (allocated_vma) {
-			*new_vma = *vma;
-			INIT_LIST_HEAD(&new_vma->shared);
-			new_vma->vm_start = new_addr;
-			new_vma->vm_end = new_addr+new_len;
-			new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT;
-			if (new_vma->vm_file)
-				get_file(new_vma->vm_file);
-			if (new_vma->vm_ops && new_vma->vm_ops->open)
-				new_vma->vm_ops->open(new_vma);
-			insert_vm_struct(current->mm, new_vma);
-		}
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		excess = vma->vm_end - vma->vm_start - old_len;
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
+	}

-		/* Conceal VM_ACCOUNT so old reservation is not undone */
-		if (vma->vm_flags & VM_ACCOUNT) {
-			vma->vm_flags &= ~VM_ACCOUNT;
-			if (addr > vma->vm_start) {
-				if (addr + old_len < vma->vm_end)
-					split = 1;
-			} else if (addr + old_len == vma->vm_end)
-				vma = NULL;	/* it will be removed */
-		} else
-			vma = NULL;	/* nothing more to do */
-
-		do_munmap(current->mm, addr, old_len);
-
-		/* Restore VM_ACCOUNT if one or two pieces of vma left */
-		if (vma) {
-			vma->vm_flags |= VM_ACCOUNT;
-			if (split)
-				vma->vm_next->vm_flags |= VM_ACCOUNT;
-		}
+	if (do_munmap(mm, old_addr, old_len) < 0) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_unacct_memory(excess >> PAGE_SHIFT);
+		excess = 0;
+	}

-		current->mm->total_vm += new_len >> PAGE_SHIFT;
-		if (vm_locked) {
-			current->mm->locked_vm += new_len >> PAGE_SHIFT;
-			if (new_len > old_len)
-				make_pages_present(new_addr + old_len,
-						   new_addr + new_len);
-		}
-		return new_addr;
+	/* Restore VM_ACCOUNT if one or two pieces of vma left */
+	if (excess) {
+		vma->vm_flags |= VM_ACCOUNT;
+		if (split)
+			vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
-	if (allocated_vma)
-		kmem_cache_free(vm_area_cachep, new_vma);
- out:
-	return -ENOMEM;
+
+	mm->total_vm += new_len >> PAGE_SHIFT;
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		if (new_len > old_len)
+			make_pages_present(new_addr + old_len,
+					   new_addr + new_len);
+	}
+
+	return new_addr;
 }

 /*
_