Having a semaphore in there causes modest performance regressions on heavily
mmap-intensive workloads on some hardware: up to 30% in SDET on NUMAQ and big
PPC64.  So switch it back to being a spinlock.

This does mean that unmap_vmas() needs to be told whether or not it is allowed
to schedule away; that's simple to do via the zap_details structure.

This change means that there will be high scheduling latencies when someone
truncates a large file which is currently mmapped, but nobody does that
anyway.  The scheduling points in unmap_vmas() are mainly for munmap() and
exit(), and they still work OK for those.

DESC
i_shared_lock fix 1
EDESC
From: Hugh Dickins

Sorry, my premature optimizations (trying to pass down NULL zap_details
except when needed) have caught you out doubly: unmap_mapping_range_list was
NULLing the details even though atomic was set; and if it hadn't, then
zap_pte_range would have missed free_swap_and_cache and pte_clear when the
pte was not present.  Moved the optimization into zap_pte_range itself.  Plus
massive documentation update.

DESC
i_shared_lock fix 2
EDESC
From: Hugh Dickins

Here's a second patch to add to the first: mremap's cows can't come home
without releasing the i_shared_lock; better move the whole "Subtle point"
locking from move_vma into move_page_tables.  And it's possible for the file
that was behind an anonymous page to be truncated while we drop that lock, so
we don't want to abort mremap because of VM_FAULT_SIGBUS.

(Eek, should we be checking do_swap_page of a vm_file area against the
truncate_count sequence?  Technically yes, but I doubt we need to bother.)

DESC
i_shared_lock mremap fix
EDESC

- We cannot hold i_shared_lock across move_one_page() because move_one_page()
  needs to perform __GFP_WAIT allocations of pagetable pages.

- Move the cond_resched() out so we test it once per page rather than only
  when move_one_page() returns -EAGAIN.
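For reference, a condensed sketch of the convention the main patch establishes
(pieced together from the hunks below; fragments, not a standalone compile
unit).  A caller which already holds mapping->i_shared_lock - now a spinlock -
marks the zap atomic, and unmap_vmas() then skips its voluntary rescheduling
point:

	/* caller side, as in unmap_mapping_range(): the lock holder marks the zap atomic */
	details.atomic = 1;				/* a spinlock is held */
	spin_lock(&mapping->i_shared_lock);
	atomic_inc(&mapping->truncate_count);		/* protect against page fault */
	if (unlikely(!list_empty(&mapping->i_mmap)))
		unmap_mapping_range_list(&mapping->i_mmap, &details);
	spin_unlock(&mapping->i_shared_lock);

	/* callee side, in unmap_vmas(): only take the scheduling point when not atomic */
	int atomic = details && details->atomic;
	if (!atomic && need_resched()) {
		int fullmm = tlb_is_full_mm(*tlbp);
		tlb_finish_mmu(*tlbp, tlb_start, start);
		cond_resched_lock(&mm->page_table_lock);
		/* ... regather the mmu batch and carry on ... */
	}

munmap() and exit() pass a NULL details, so atomic evaluates to zero there and
those paths keep their scheduling points.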
---

 25-akpm/Documentation/vm/locking |    2 +-
 25-akpm/fs/hugetlbfs/inode.c     |    4 ++--
 25-akpm/fs/inode.c               |    2 +-
 25-akpm/include/linux/fs.h       |    2 +-
 25-akpm/include/linux/mm.h       |    1 +
 25-akpm/include/linux/rmap.h     |   10 ++--------
 25-akpm/kernel/fork.c            |    4 ++--
 25-akpm/mm/filemap.c             |    6 +++---
 25-akpm/mm/madvise.c             |   10 ++++------
 25-akpm/mm/memory.c              |   17 +++++++++--------
 25-akpm/mm/mmap.c                |   32 ++++++++++++++++----------------
 25-akpm/mm/mremap.c              |   13 -------------
 25-akpm/mm/rmap.c                |   12 ++++++------
 13 files changed, 48 insertions(+), 67 deletions(-)

diff -puN fs/hugetlbfs/inode.c~i_shared_lock fs/hugetlbfs/inode.c
--- 25/fs/hugetlbfs/inode.c~i_shared_lock	2004-05-04 20:54:36.446202464 -0700
+++ 25-akpm/fs/hugetlbfs/inode.c	2004-05-04 20:54:36.466199424 -0700
@@ -319,12 +319,12 @@ static int hugetlb_vmtruncate(struct ino
 	pgoff = offset >> HPAGE_SHIFT;
 
 	inode->i_size = offset;
-	down(&mapping->i_shared_sem);
+	spin_lock(&mapping->i_shared_lock);
 	if (!list_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
 	if (!list_empty(&mapping->i_mmap_shared))
 		hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 	truncate_hugepages(mapping, offset);
 	return 0;
 }
diff -puN fs/inode.c~i_shared_lock fs/inode.c
--- 25/fs/inode.c~i_shared_lock	2004-05-04 20:54:36.448202160 -0700
+++ 25-akpm/fs/inode.c	2004-05-04 20:54:36.467199272 -0700
@@ -184,7 +184,7 @@ void inode_init_once(struct inode *inode
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
-	init_MUTEX(&inode->i_data.i_shared_sem);
+	spin_lock_init(&inode->i_data.i_shared_lock);
 	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
diff -puN include/linux/fs.h~i_shared_lock include/linux/fs.h
--- 25/include/linux/fs.h~i_shared_lock	2004-05-04 20:54:36.449202008 -0700
+++ 25-akpm/include/linux/fs.h	2004-05-04 20:54:36.468199120 -0700
@@ -331,7 +331,7 @@ struct address_space {
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
-	struct semaphore	i_shared_sem;	/* protect both above lists */
+	spinlock_t		i_shared_lock;	/* protect both above lists */
 	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		flags;		/* error bits/gfp mask */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~i_shared_lock include/linux/mm.h
--- 25/include/linux/mm.h~i_shared_lock	2004-05-04 20:54:36.451201704 -0700
+++ 25-akpm/include/linux/mm.h	2004-05-04 20:54:36.469198968 -0700
@@ -442,6 +442,7 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t	last_index;			/* Highest page->index to unmap */
+	int atomic;				/* May not schedule() */
 };
 
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
diff -puN kernel/fork.c~i_shared_lock kernel/fork.c
--- 25/kernel/fork.c~i_shared_lock	2004-05-04 20:54:36.452201552 -0700
+++ 25-akpm/kernel/fork.c	2004-05-04 20:54:36.470198816 -0700
@@ -323,9 +323,9 @@ static inline int dup_mmap(struct mm_str
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
-			down(&file->f_mapping->i_shared_sem);
+			spin_lock(&file->f_mapping->i_shared_lock);
 			list_add(&tmp->shared, &mpnt->shared);
-			up(&file->f_mapping->i_shared_sem);
+			spin_unlock(&file->f_mapping->i_shared_lock);
 		}
 
 		/*
diff -puN mm/filemap.c~i_shared_lock mm/filemap.c
--- 25/mm/filemap.c~i_shared_lock	2004-05-04 20:54:36.453201400 -0700
+++ 25-akpm/mm/filemap.c	2004-05-04 20:54:36.471198664 -0700
@@ -55,17 +55,17 @@
 /*
  * Lock ordering:
  *
- *  ->i_shared_sem		(vmtruncate)
+ *  ->i_shared_lock		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_list_lock
  *        ->swap_device_lock	(exclusive_swap_page, others)
  *          ->mapping->tree_lock
 *
- *  ->i_sem
- *    ->i_shared_sem		(truncate->unmap_mapping_range)
+ *  ->i_sem
+ *    ->i_shared_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
- *    ->i_shared_sem		(various places)
+ *    ->i_shared_lock		(various places)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
diff -puN mm/madvise.c~i_shared_lock mm/madvise.c
--- 25/mm/madvise.c~i_shared_lock	2004-05-04 20:54:36.455201096 -0700
+++ 25-akpm/mm/madvise.c	2004-05-04 20:54:36.472198512 -0700
@@ -92,16 +92,14 @@ static long madvise_willneed(struct vm_a
 static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
-	struct zap_details details;
-
 	if (vma->vm_flags & VM_LOCKED)
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-		details.check_mapping = NULL;
-		details.nonlinear_vma = vma;
-		details.first_index = 0;
-		details.last_index = ULONG_MAX;
+		struct zap_details details = {
+			.nonlinear_vma = vma,
+			.last_index = ULONG_MAX,
+		};
 		zap_page_range(vma, start, end - start, &details);
 	} else
 		zap_page_range(vma, start, end - start, NULL);
diff -puN mm/memory.c~i_shared_lock mm/memory.c
--- 25/mm/memory.c~i_shared_lock	2004-05-04 20:54:36.456200944 -0700
+++ 25-akpm/mm/memory.c	2004-05-04 20:54:36.474198208 -0700
@@ -365,6 +365,8 @@ static void zap_pte_range(struct mmu_gat
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
 	size &= PAGE_MASK;
+	if (details && !details->check_mapping && !details->nonlinear_vma)
+		details = NULL;
 	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
 		pte_t pte = *ptep;
 		if (pte_none(pte))
@@ -518,6 +520,7 @@ int unmap_vmas(struct mmu_gather **tlbp,
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	int ret = 0;
+	int atomic = details && details->atomic;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long start;
@@ -555,7 +558,7 @@ int unmap_vmas(struct mmu_gather **tlbp,
 			zap_bytes -= block;
 			if ((long)zap_bytes > 0)
 				continue;
-			if (need_resched()) {
+			if (!atomic && need_resched()) {
 				int fullmm = tlb_is_full_mm(*tlbp);
 				tlb_finish_mmu(*tlbp, tlb_start, start);
 				cond_resched_lock(&mm->page_table_lock);
@@ -583,8 +586,6 @@ void zap_page_range(struct vm_area_struc
 	unsigned long end = address + size;
 	unsigned long nr_accounted = 0;
 
-	might_sleep();
-
 	if (is_vm_hugetlb_page(vma)) {
 		zap_hugepage_range(vma, address, size);
 		return;
@@ -1133,8 +1134,7 @@ static void unmap_mapping_range_list(str
 			zea = vea;
 		zap_page_range(vma,
 			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
-			(zea - zba + 1) << PAGE_SHIFT,
-			details->check_mapping? details: NULL);
+			(zea - zba + 1) << PAGE_SHIFT, details);
 	}
 }
 
@@ -1155,7 +1155,7 @@ static void unmap_mapping_range_list(str
  * but 0 when invalidating pagecache, don't throw away private data.
  */
 void unmap_mapping_range(struct address_space *mapping,
-	loff_t const holebegin, loff_t const holelen, int even_cows)
+		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
 	struct zap_details details;
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
@@ -1173,10 +1173,11 @@ void unmap_mapping_range(struct address_
 	details.nonlinear_vma = NULL;
 	details.first_index = hba;
 	details.last_index = hba + hlen - 1;
+	details.atomic = 1;	/* A spinlock is held */
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
-	down(&mapping->i_shared_sem);
+	spin_lock(&mapping->i_shared_lock);
 	/* Protect against page fault */
 	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
@@ -1187,7 +1188,7 @@ void unmap_mapping_range(struct address_
 
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
 		unmap_mapping_range_list(&mapping->i_mmap_shared, &details);
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
diff -puN mm/mmap.c~i_shared_lock mm/mmap.c
--- 25/mm/mmap.c~i_shared_lock	2004-05-04 20:54:36.457200792 -0700
+++ 25-akpm/mm/mmap.c	2004-05-04 20:54:36.475198056 -0700
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(sysctl_max_map_count);
 EXPORT_SYMBOL(vm_committed_space);
 
 /*
- * Requires inode->i_mapping->i_shared_sem
+ * Requires inode->i_mapping->i_shared_lock
  */
 static inline void
 __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
@@ -84,9 +84,9 @@ static void remove_shared_vm_struct(stru
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		down(&mapping->i_shared_sem);
+		spin_lock(&mapping->i_shared_lock);
 		__remove_shared_vm_struct(vma, file->f_dentry->d_inode);
-		up(&mapping->i_shared_sem);
+		spin_unlock(&mapping->i_shared_lock);
 	}
 }
 
@@ -286,12 +286,12 @@ static void vma_link(struct mm_struct *m
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping)
-		down(&mapping->i_shared_sem);
+		spin_lock(&mapping->i_shared_lock);
 	spin_lock(&mm->page_table_lock);
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
-		up(&mapping->i_shared_sem);
+		spin_unlock(&mapping->i_shared_lock);
 
 	mark_mm_hugetlb(mm, vma);
 	mm->map_count++;
@@ -301,7 +301,7 @@ static void vma_link(struct mm_struct *m
 /*
  * Insert vm structure into process list sorted by address and into the inode's
  * i_mmap ring. The caller should hold mm->page_table_lock and
- * ->f_mappping->i_shared_sem if vm_file is non-NULL.
+ * ->f_mappping->i_shared_lock if vm_file is non-NULL.
  */
 static void
 __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
@@ -391,7 +391,7 @@ static struct vm_area_struct *vma_merge(
 {
 	spinlock_t *lock = &mm->page_table_lock;
 	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
-	struct semaphore *i_shared_sem;
+	spinlock_t *i_shared_lock;
 
 	/*
	 * We later require that vma->vm_flags == vm_flags, so this tests
@@ -400,7 +400,7 @@ static struct vm_area_struct *vma_merge(
 	if (vm_flags & VM_SPECIAL)
 		return NULL;
 
-	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
+	i_shared_lock = file ? &file->f_mapping->i_shared_lock : NULL;
 
 	if (!prev) {
 		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
@@ -417,7 +417,7 @@ static struct vm_area_struct *vma_merge(
 
 		if (unlikely(file && prev->vm_next &&
 				prev->vm_next->vm_file == file)) {
-			down(i_shared_sem);
+			spin_lock(i_shared_lock);
 			need_up = 1;
 		}
 		spin_lock(lock);
@@ -435,7 +435,7 @@ static struct vm_area_struct *vma_merge(
 			__remove_shared_vm_struct(next, inode);
 			spin_unlock(lock);
 			if (need_up)
-				up(i_shared_sem);
+				spin_unlock(i_shared_lock);
 			if (file)
 				fput(file);
 
@@ -445,7 +445,7 @@ static struct vm_area_struct *vma_merge(
 		}
 		spin_unlock(lock);
 		if (need_up)
-			up(i_shared_sem);
+			spin_unlock(i_shared_lock);
 		return prev;
 	}
 
@@ -460,13 +460,13 @@ static struct vm_area_struct *vma_merge(
 			return NULL;
 		if (end == prev->vm_start) {
 			if (file)
-				down(i_shared_sem);
+				spin_lock(i_shared_lock);
 			spin_lock(lock);
 			prev->vm_start = addr;
 			prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
 			spin_unlock(lock);
 			if (file)
-				up(i_shared_sem);
+				spin_unlock(i_shared_lock);
 			return prev;
 		}
 	}
@@ -1232,7 +1232,7 @@ int split_vma(struct mm_struct * mm, str
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping)
-		down(&mapping->i_shared_sem);
+		spin_lock(&mapping->i_shared_lock);
 	spin_lock(&mm->page_table_lock);
 
 	if (new_below) {
@@ -1245,7 +1245,7 @@ int split_vma(struct mm_struct * mm, str
 
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
-		up(&mapping->i_shared_sem);
+		spin_unlock(&mapping->i_shared_lock);
 
 	return 0;
 }
@@ -1479,7 +1479,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap ring.  If vm_file is non-NULL
- * then i_shared_sem is taken here.
+ * then i_shared_lock is taken here.
  */
 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
diff -puN mm/mremap.c~i_shared_lock mm/mremap.c
--- 25/mm/mremap.c~i_shared_lock	2004-05-04 20:54:36.459200488 -0700
+++ 25-akpm/mm/mremap.c	2004-05-04 20:54:36.476197904 -0700
@@ -180,7 +180,6 @@ static unsigned long move_vma(struct vm_
 		unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct address_space *mapping = NULL;
 	struct vm_area_struct *new_vma;
 	unsigned long vm_flags = vma->vm_flags;
 	unsigned long new_pgoff;
@@ -201,16 +200,6 @@ static unsigned long move_vma(struct vm_
 	if (!new_vma)
 		return -ENOMEM;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock vmtruncate out,
-		 * since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		down(&mapping->i_shared_sem);
-	}
 	moved_len = move_page_tables(vma, new_addr, old_addr, old_len, &cows);
 	if (moved_len < old_len) {
 		/*
@@ -227,8 +216,6 @@ static unsigned long move_vma(struct vm_
 	if (cows)	/* Downgrade or remove this message later */
 		printk(KERN_WARNING "%s: mremap moved %d cows\n",
 						current->comm, cows);
-	if (mapping)
-		up(&mapping->i_shared_sem);
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
diff -puN mm/rmap.c~i_shared_lock mm/rmap.c
--- 25/mm/rmap.c~i_shared_lock	2004-05-04 20:54:36.460200336 -0700
+++ 25-akpm/mm/rmap.c	2004-05-04 20:54:36.477197752 -0700
@@ -300,7 +300,7 @@ migrate:
 *
 * This function is only called from page_referenced for object-based pages.
 *
- * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * The semaphore address_space->i_shared_lock is tried.  If it can't be gotten,
 * assume a reference count of 0, so try_to_unmap will then have a go.
 */
static inline int page_referenced_file(struct page *page)
@@ -313,7 +313,7 @@ static inline int page_referenced_file(s
 	int referenced = 0;
 	int failed = 0;
 
-	if (down_trylock(&mapping->i_shared_sem))
+	if (!spin_trylock(&mapping->i_shared_lock))
 		return 0;
 
 	list_for_each_entry(vma, &mapping->i_mmap, shared) {
@@ -355,7 +355,7 @@ static inline int page_referenced_file(s
 
 	WARN_ON(!failed);
 out:
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 	return referenced;
 }
 
@@ -710,7 +710,7 @@ out:
 *
 * This function is only called from try_to_unmap for object-based pages.
 *
- * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * The semaphore address_space->i_shared_lock is tried.  If it can't be gotten,
 * return a temporary error.
 */
static inline int try_to_unmap_file(struct page *page)
@@ -725,7 +725,7 @@ static inline int try_to_unmap_file(stru
 	unsigned long max_nl_cursor = 0;
 	unsigned long max_nl_size = 0;
 
-	if (down_trylock(&mapping->i_shared_sem))
+	if (!spin_trylock(&mapping->i_shared_lock))
 		return ret;
 
 	list_for_each_entry(vma, &mapping->i_mmap, shared) {
@@ -824,7 +824,7 @@ static inline int try_to_unmap_file(stru
 relock:
 	rmap_lock(page);
 out:
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 	return ret;
 }
 
diff -puN Documentation/vm/locking~i_shared_lock Documentation/vm/locking
--- 25/Documentation/vm/locking~i_shared_lock	2004-05-04 20:54:36.461200184 -0700
+++ 25-akpm/Documentation/vm/locking	2004-05-04 20:54:36.477197752 -0700
@@ -66,7 +66,7 @@ in some cases it is not really needed. E
 expand_stack(), it is hard to come up with a destructive scenario without
 having the vmlist protection in this case.
 
-The page_table_lock nests with the inode i_shared_sem and the kmem cache
+The page_table_lock nests with the inode i_shared_lock and the kmem cache
 c_spinlock spinlocks.  This is okay, since the kmem code asks for pages after
 dropping c_spinlock.  The page_table_lock also nests with pagecache_lock and
 pagemap_lru_lock spinlocks, and no code asks for memory with these locks
diff -puN include/linux/rmap.h~i_shared_lock include/linux/rmap.h
--- 25/include/linux/rmap.h~i_shared_lock	2004-05-04 20:54:36.462200032 -0700
+++ 25-akpm/include/linux/rmap.h	2004-05-04 20:54:36.478197600 -0700
@@ -69,15 +69,9 @@ static inline int mremap_moved_anon_rmap
 static inline int make_page_exclusive(struct vm_area_struct *vma,
 					unsigned long addr)
 {
-	switch (handle_mm_fault(vma->vm_mm, vma, addr, 1)) {
-	case VM_FAULT_MINOR:
-	case VM_FAULT_MAJOR:
+	if (handle_mm_fault(vma->vm_mm, vma, addr, 1) != VM_FAULT_OOM)
 		return 0;
-	case VM_FAULT_OOM:
-		return -ENOMEM;
-	default:
-		return -EFAULT;
-	}
+	return -ENOMEM;
 }
 
 /*
_