From: Hugh Dickins

If file-based vmas are to be kept in a tree, according to the file offsets
they map, then adjusting the vma's start pgoff or its end involves
repositioning in the tree, while holding i_shared_lock (and
page_table_lock).  We used to avoid that if possible, e.g. when just
moving end; but if we're heading that way, let's now tidy up vma_merge
and split_vma, and do all the locking and adjustment in a new helper
vma_adjust.  And please, let's call the next vma in vma_merge "next"
rather than "prev".

Since these patches are diffed over 2.6.6-rc2-mm2, they include the NUMA
mpolicy mods which you'll have to remove to go earlier in the series,
sorry for that nuisance.  I have intentionally changed the one
vma_mpol_equal to mpol_equal.

diff -upN reference/include/linux/mm.h current/include/linux/mm.h
--- reference/include/linux/mm.h	2004-05-01 10:25:16.000000000 -0700
+++ current/include/linux/mm.h	2004-05-01 10:28:27.000000000 -0700
@@ -552,6 +552,8 @@ extern void si_meminfo(struct sysinfo *
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 
 /* mmap.c */
+extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *next);
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
diff -upN reference/mm/mmap.c current/mm/mmap.c
--- reference/mm/mmap.c	2004-05-01 10:25:17.000000000 -0700
+++ current/mm/mmap.c	2004-05-01 10:28:27.000000000 -0700
@@ -66,13 +66,11 @@ EXPORT_SYMBOL(vm_committed_space);
  * Requires inode->i_mapping->i_shared_lock
  */
 static inline void
-__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
+__remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file)
 {
-	if (inode) {
-		if (vma->vm_flags & VM_DENYWRITE)
-			atomic_inc(&inode->i_writecount);
-		list_del_init(&vma->shared);
-	}
+	if (vma->vm_flags & VM_DENYWRITE)
+		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+	list_del_init(&vma->shared);
 }
 
 /*
@@ -85,7 +83,7 @@ static void remove_shared_vm_struct(stru
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
 		spin_lock(&mapping->i_shared_lock);
-		__remove_shared_vm_struct(vma, file->f_dentry->d_inode);
+		__remove_shared_vm_struct(vma, file);
 		spin_unlock(&mapping->i_shared_lock);
 	}
 }
@@ -319,6 +317,54 @@ __insert_vm_struct(struct mm_struct * mm
 }
 
 /*
+ * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that is
+ * already present in an i_mmap{_shared} tree without adjusting the tree.
+ * The following helper function should be used when such adjustments
+ * are necessary. The "next" vma (if any) is to be removed or inserted
+ * before we drop the necessary locks.
+ */
+void vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *next)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct address_space *mapping = NULL;
+	struct file *file = vma->vm_file;
+
+	if (file) {
+		mapping = file->f_mapping;
+		spin_lock(&mapping->i_shared_lock);
+	}
+	spin_lock(&mm->page_table_lock);
+
+	vma->vm_start = start;
+	vma->vm_end = end;
+	vma->vm_pgoff = pgoff;
+
+	if (next) {
+		if (next == vma->vm_next) {
+			/*
+			 * vma_merge has merged next into vma, and needs
+			 * us to remove next before dropping the locks.
+			 */
+			__vma_unlink(mm, next, vma);
+			if (file)
+				__remove_shared_vm_struct(next, file);
+		} else {
+			/*
+			 * split_vma has split next from vma, and needs
+			 * us to insert next before dropping the locks
+			 * (next may either follow vma or precede it).
+			 */
+			__insert_vm_struct(mm, next);
+		}
+	}
+
+	spin_unlock(&mm->page_table_lock);
+	if (mapping)
+		spin_unlock(&mapping->i_shared_lock);
+}
+
+/*
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
  */
@@ -390,90 +436,59 @@ static struct vm_area_struct *vma_merge(
 			struct file *file, unsigned long pgoff,
 			struct mempolicy *policy)
 {
-	spinlock_t *lock = &mm->page_table_lock;
-	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
-	spinlock_t *i_shared_lock;
+	struct vm_area_struct *next;
 
 	/*
-	 * We later require that vma->vm_flags == vm_flags, so this tests
-	 * vma->vm_flags & VM_SPECIAL, too.
+	 * We later require that vma->vm_flags == vm_flags,
+	 * so this tests vma->vm_flags & VM_SPECIAL, too.
 	 */
 	if (vm_flags & VM_SPECIAL)
 		return NULL;
 
-	i_shared_lock = file ? &file->f_mapping->i_shared_lock : NULL;
-
 	if (!prev) {
-		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
+		next = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
 		goto merge_next;
 	}
+	next = prev->vm_next;
 
 	/*
 	 * Can it merge with the predecessor?
 	 */
 	if (prev->vm_end == addr &&
-		mpol_equal(vma_policy(prev), policy) &&
+			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
-		struct vm_area_struct *next;
-		int need_up = 0;
-
-		if (unlikely(file && prev->vm_next &&
-				prev->vm_next->vm_file == file)) {
-			spin_lock(i_shared_lock);
-			need_up = 1;
-		}
-		spin_lock(lock);
-		prev->vm_end = end;
-
 		/*
-		 * OK, it did. Can we now merge in the successor as well?
+		 * OK, it can. Can we now merge in the successor as well?
 		 */
-		next = prev->vm_next;
-		if (next && prev->vm_end == next->vm_start &&
-				vma_mpol_equal(prev, next) &&
+		if (next && end == next->vm_start &&
+				mpol_equal(policy, vma_policy(next)) &&
 				can_vma_merge_before(next, vm_flags, file,
 					pgoff, (end - addr) >> PAGE_SHIFT)) {
-			prev->vm_end = next->vm_end;
-			__vma_unlink(mm, next, prev);
-			__remove_shared_vm_struct(next, inode);
-			spin_unlock(lock);
-			if (need_up)
-				spin_unlock(i_shared_lock);
+			vma_adjust(prev, prev->vm_start,
+				next->vm_end, prev->vm_pgoff, next);
 			if (file)
 				fput(file);
-			mm->map_count--;
 			mpol_free(vma_policy(next));
 			kmem_cache_free(vm_area_cachep, next);
-			return prev;
-		}
-		spin_unlock(lock);
-		if (need_up)
-			spin_unlock(i_shared_lock);
+		} else
+			vma_adjust(prev, prev->vm_start,
+				end, prev->vm_pgoff, NULL);
 		return prev;
 	}
 
 	/*
-	 * Can this new request be merged in front of prev->vm_next?
+	 * Can this new request be merged in front of next?
 	 */
-	prev = prev->vm_next;
-	if (prev) {
+	if (next) {
 merge_next:
-		if (!mpol_equal(policy, vma_policy(prev)))
-			return 0;
-		if (!can_vma_merge_before(prev, vm_flags, file,
-				pgoff, (end - addr) >> PAGE_SHIFT))
-			return NULL;
-		if (end == prev->vm_start) {
-			if (file)
-				spin_lock(i_shared_lock);
-			spin_lock(lock);
-			prev->vm_start = addr;
-			prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
-			spin_unlock(lock);
-			if (file)
-				spin_unlock(i_shared_lock);
-			return prev;
+		if (end == next->vm_start &&
+				mpol_equal(policy, vma_policy(next)) &&
+				can_vma_merge_before(next, vm_flags, file,
					pgoff, (end - addr) >> PAGE_SHIFT)) {
+			vma_adjust(next, addr,
+				next->vm_end, pgoff, NULL);
+			return next;
 		}
 	}
@@ -1213,7 +1228,6 @@ int split_vma(struct mm_struct * mm, str
 {
 	struct mempolicy *pol;
 	struct vm_area_struct *new;
-	struct address_space *mapping = NULL;
 
 	if (mm->map_count >= sysctl_max_map_count)
 		return -ENOMEM;
@@ -1247,24 +1261,11 @@ int split_vma(struct mm_struct * mm, str
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
 
-	if (vma->vm_file)
-		mapping = vma->vm_file->f_mapping;
-
-	if (mapping)
-		spin_lock(&mapping->i_shared_lock);
-	spin_lock(&mm->page_table_lock);
-
-	if (new_below) {
-		vma->vm_start = addr;
-		vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
-	} else
-		vma->vm_end = addr;
-
-	__insert_vm_struct(mm, new);
-
-	spin_unlock(&mm->page_table_lock);
-	if (mapping)
-		spin_unlock(&mapping->i_shared_lock);
+	if (new_below)
+		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+			((addr - new->vm_start) >> PAGE_SHIFT), new);
+	else
+		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
 	return 0;
 }
diff -upN reference/mm/mremap.c current/mm/mremap.c
--- reference/mm/mremap.c	2004-05-01 10:28:27.000000000 -0700
+++ current/mm/mremap.c	2004-05-01 10:28:27.000000000 -0700
@@ -392,9 +392,10 @@ unsigned long do_mremap(unsigned long ad
 		/* can we just expand the current mapping? */
 		if (max_addr - addr >= new_len) {
 			int pages = (new_len - old_len) >> PAGE_SHIFT;
-			spin_lock(&vma->vm_mm->page_table_lock);
-			vma->vm_end = addr + new_len;
-			spin_unlock(&vma->vm_mm->page_table_lock);
+
+			vma_adjust(vma, vma->vm_start,
+				addr + new_len, vma->vm_pgoff, NULL);
+
 			current->mm->total_vm += pages;
 			if (vma->vm_flags & VM_LOCKED) {
 				current->mm->locked_vm += pages;
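
To see the calling convention without the kernel around it, here is a
minimal userspace sketch, not kernel code: a single pthread mutex stands
in for both i_shared_lock and page_table_lock, a plain singly-linked
list stands in for the mm's vma list and the i_mmap{_shared} tree, and
unlink_next/insert_vma are stand-ins for __vma_unlink and
__insert_vm_struct.

/* Userspace model of the vma_adjust calling convention; illustration only. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct vma {
	unsigned long start, end, pgoff;
	struct vma *next;
};

/* One mutex stands in for i_shared_lock plus page_table_lock. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct vma *vma_list;

/* Stand-in for __vma_unlink: drop "next", known to follow "vma". */
static void unlink_next(struct vma *vma, struct vma *next)
{
	vma->next = next->next;
}

/* Stand-in for __insert_vm_struct: link "new" in ascending start order. */
static void insert_vma(struct vma *new)
{
	struct vma **p = &vma_list;

	while (*p && (*p)->start < new->start)
		p = &(*p)->next;
	new->next = *p;
	*p = new;
}

/*
 * The shape of the helper: the start/end/pgoff updates, plus the unlink
 * of a merged vma or the link of a freshly split one, all happen under
 * the same lock, in one place.
 */
static void vma_adjust(struct vma *vma, unsigned long start,
	unsigned long end, unsigned long pgoff, struct vma *next)
{
	pthread_mutex_lock(&lock);
	vma->start = start;
	vma->end = end;
	vma->pgoff = pgoff;
	if (next) {
		if (next == vma->next)
			unlink_next(vma, next);	/* vma_merge case */
		else
			insert_vma(next);	/* split_vma case */
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct vma *a = calloc(1, sizeof(*a));
	struct vma *b = calloc(1, sizeof(*b));
	struct vma *v;

	a->start = 0x1000; a->end = 0x3000; a->pgoff = 0;
	insert_vma(a);

	/* split_vma-style call: carve [0x2000,0x3000) off into b */
	b->start = 0x2000; b->end = 0x3000;
	b->pgoff = a->pgoff + ((0x2000 - a->start) >> 12);
	vma_adjust(a, a->start, 0x2000, a->pgoff, b);

	/* vma_merge-style call: extend a back over b, then free b */
	vma_adjust(a, a->start, b->end, a->pgoff, b);
	free(b);

	for (v = vma_list; v; v = v->next)
		printf("vma [%#lx,%#lx) pgoff %lu\n", v->start, v->end, v->pgoff);
	free(a);
	return 0;
}

The point is only the shape of the three call sites: a merge passes the
vma being swallowed so it is unlinked under the lock, a split passes the
freshly built vma so it is linked under the same lock, and a plain
resize (as in do_mremap) passes NULL.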