Having a semaphore in there causes modest performance regressions on heavily
mmap-intensive workloads on some hardware: up to 30% in SDET on NUMAQ and big
PPC64.  So switch it back to being a spinlock.

This does mean that unmap_vmas() needs to be told whether or not it is allowed
to schedule away; that's simple to do via the zap_details structure.

This change means that there will be high scheduling latencies when someone
truncates a large file which is currently mmapped, but nobody does that
anyway.  The scheduling points in unmap_vmas() are mainly for munmap() and
exit(), and they still work OK for those.

DESC
i_shared_lock fix 1
EDESC
From: Hugh Dickins

Sorry, my premature optimizations (trying to pass down NULL zap_details
except when needed) have caught you out doubly: unmap_mapping_range_list was
NULLing the details even though atomic was set; and if it hadn't, then
zap_pte_range would have missed free_swap_and_cache and pte_clear when the
pte was not present.  Moved the optimization into zap_pte_range itself.  Plus
massive documentation update.

DESC
i_shared_lock fix 2
EDESC
From: Hugh Dickins

Here's a second patch to add to the first: mremap's cows can't come home
without releasing the i_shared_lock; better move the whole "Subtle point"
locking from move_vma into move_page_tables.  And it's possible for the file
that was behind an anonymous page to be truncated while we drop that lock, so
we don't want to abort mremap because of VM_FAULT_SIGBUS.

(Eek, should we be checking do_swap_page of a vm_file area against the
truncate_count sequence?  Technically yes, but I doubt we need to bother.)

DESC
i_shared_lock mremap fix
EDESC

- We cannot hold i_shared_lock across move_one_page() because move_one_page()
  needs to perform __GFP_WAIT allocations of pagetable pages.

- Move the cond_resched() out so we test it once per page rather than only
  when move_one_page() returns -EAGAIN.
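For reference, a condensed sketch of the convention the main patch establishes
(pieced together from the hunks below; fragments, not a standalone compile
unit).  A caller which already holds mapping->i_shared_lock - now a spinlock -
marks the zap atomic, and unmap_vmas() then skips its voluntary rescheduling
point:

	/* caller side, as in unmap_mapping_range(): the lock holder marks the zap atomic */
	details.atomic = 1;				/* a spinlock is held */
	spin_lock(&mapping->i_shared_lock);
	atomic_inc(&mapping->truncate_count);		/* protect against page fault */
	if (unlikely(!list_empty(&mapping->i_mmap)))
		unmap_mapping_range_list(&mapping->i_mmap, &details);
	spin_unlock(&mapping->i_shared_lock);

	/* callee side, in unmap_vmas(): only take the scheduling point when not atomic */
	int atomic = details && details->atomic;
	if (!atomic && need_resched()) {
		int fullmm = tlb_is_full_mm(*tlbp);
		tlb_finish_mmu(*tlbp, tlb_start, start);
		cond_resched_lock(&mm->page_table_lock);
		/* ... regather the mmu batch and carry on ... */
	}

munmap() and exit() pass a NULL details, so atomic evaluates to zero there and
those paths keep their scheduling points.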
---

 25-akpm/Documentation/vm/locking |    2 +-
 25-akpm/fs/hugetlbfs/inode.c     |    4 ++--
 25-akpm/fs/inode.c               |    2 +-
 25-akpm/include/linux/fs.h       |    2 +-
 25-akpm/include/linux/mm.h       |    1 +
 25-akpm/include/linux/rmap.h     |   10 ++--------
 25-akpm/kernel/fork.c            |    4 ++--
 25-akpm/mm/filemap.c             |    6 +++---
 25-akpm/mm/madvise.c             |   10 ++++------
 25-akpm/mm/memory.c              |   17 +++++++++--------
 25-akpm/mm/mmap.c                |   32 ++++++++++++++++----------------
 25-akpm/mm/mremap.c              |   13 -------------
 25-akpm/mm/rmap.c                |   12 ++++++------
 13 files changed, 48 insertions(+), 67 deletions(-)

diff -puN fs/hugetlbfs/inode.c~i_shared_lock fs/hugetlbfs/inode.c
--- 25/fs/hugetlbfs/inode.c~i_shared_lock	2004-05-04 20:54:36.446202464 -0700
+++ 25-akpm/fs/hugetlbfs/inode.c	2004-05-04 20:54:36.466199424 -0700
@@ -319,12 +319,12 @@ static int hugetlb_vmtruncate(struct ino
 	pgoff = offset >> HPAGE_SHIFT;
 
 	inode->i_size = offset;
-	down(&mapping->i_shared_sem);
+	spin_lock(&mapping->i_shared_lock);
 	if (!list_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
 	if (!list_empty(&mapping->i_mmap_shared))
 		hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff);
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 	truncate_hugepages(mapping, offset);
 	return 0;
 }
diff -puN fs/inode.c~i_shared_lock fs/inode.c
--- 25/fs/inode.c~i_shared_lock	2004-05-04 20:54:36.448202160 -0700
+++ 25-akpm/fs/inode.c	2004-05-04 20:54:36.467199272 -0700
@@ -184,7 +184,7 @@ void inode_init_once(struct inode *inode
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
-	init_MUTEX(&inode->i_data.i_shared_sem);
+	spin_lock_init(&inode->i_data.i_shared_lock);
 	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
diff -puN include/linux/fs.h~i_shared_lock include/linux/fs.h
--- 25/include/linux/fs.h~i_shared_lock	2004-05-04 20:54:36.449202008 -0700
+++ 25-akpm/include/linux/fs.h	2004-05-04 20:54:36.468199120 -0700
@@ -331,7 +331,7 @@ struct address_space {
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
-	struct semaphore	i_shared_sem;	/* protect both above lists */
+	spinlock_t		i_shared_lock;	/* protect both above lists */
 	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		flags;		/* error bits/gfp mask */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
diff -puN include/linux/mm.h~i_shared_lock include/linux/mm.h
--- 25/include/linux/mm.h~i_shared_lock	2004-05-04 20:54:36.451201704 -0700
+++ 25-akpm/include/linux/mm.h	2004-05-04 20:54:36.469198968 -0700
@@ -442,6 +442,7 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t	last_index;			/* Highest page->index to unmap */
+	int atomic;				/* May not schedule() */
 };
 
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
diff -puN kernel/fork.c~i_shared_lock kernel/fork.c
--- 25/kernel/fork.c~i_shared_lock	2004-05-04 20:54:36.452201552 -0700
+++ 25-akpm/kernel/fork.c	2004-05-04 20:54:36.470198816 -0700
@@ -323,9 +323,9 @@ static inline int dup_mmap(struct mm_str
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
-			down(&file->f_mapping->i_shared_sem);
+			spin_lock(&file->f_mapping->i_shared_lock);
 			list_add(&tmp->shared, &mpnt->shared);
-			up(&file->f_mapping->i_shared_sem);
+			spin_unlock(&file->f_mapping->i_shared_lock);
 		}
 
 		/*
diff -puN mm/filemap.c~i_shared_lock mm/filemap.c
--- 25/mm/filemap.c~i_shared_lock	2004-05-04 20:54:36.453201400 -0700
+++ 25-akpm/mm/filemap.c	2004-05-04 20:54:36.471198664 -0700
@@ -55,17 +55,17 @@
 /*
  * Lock ordering:
  *
- *  ->i_shared_sem		(vmtruncate)
+ *  ->i_shared_lock		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_list_lock
  *        ->swap_device_lock	(exclusive_swap_page, others)
  *          ->mapping->tree_lock
 *
- *  ->i_sem
- *    ->i_shared_sem		(truncate->unmap_mapping_range)
+ *  ->i_sem
+ *    ->i_shared_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
- *    ->i_shared_sem		(various places)
+ *    ->i_shared_lock		(various places)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
diff -puN mm/madvise.c~i_shared_lock mm/madvise.c
--- 25/mm/madvise.c~i_shared_lock	2004-05-04 20:54:36.455201096 -0700
+++ 25-akpm/mm/madvise.c	2004-05-04 20:54:36.472198512 -0700
@@ -92,16 +92,14 @@ static long madvise_willneed(struct vm_a
 static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
-	struct zap_details details;
-
 	if (vma->vm_flags & VM_LOCKED)
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-		details.check_mapping = NULL;
-		details.nonlinear_vma = vma;
-		details.first_index = 0;
-		details.last_index = ULONG_MAX;
+		struct zap_details details = {
+			.nonlinear_vma = vma,
+			.last_index = ULONG_MAX,
+		};
 		zap_page_range(vma, start, end - start, &details);
 	} else
 		zap_page_range(vma, start, end - start, NULL);
diff -puN mm/memory.c~i_shared_lock mm/memory.c
--- 25/mm/memory.c~i_shared_lock	2004-05-04 20:54:36.456200944 -0700
+++ 25-akpm/mm/memory.c	2004-05-04 20:54:36.474198208 -0700
@@ -365,6 +365,8 @@ static void zap_pte_range(struct mmu_gat
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
 	size &= PAGE_MASK;
+	if (details && !details->check_mapping && !details->nonlinear_vma)
+		details = NULL;
 	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
 		pte_t pte = *ptep;
 		if (pte_none(pte))
@@ -518,6 +520,7 @@ int unmap_vmas(struct mmu_gather **tlbp,
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	int ret = 0;
+	int atomic = details && details->atomic;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long start;
@@ -555,7 +558,7 @@ int unmap_vmas(struct mmu_gather **tlbp,
 			zap_bytes -= block;
 			if ((long)zap_bytes > 0)
 				continue;
-			if (need_resched()) {
+			if (!atomic && need_resched()) {
 				int fullmm = tlb_is_full_mm(*tlbp);
 				tlb_finish_mmu(*tlbp, tlb_start, start);
 				cond_resched_lock(&mm->page_table_lock);
@@ -583,8 +586,6 @@ void zap_page_range(struct vm_area_struc
 	unsigned long end = address + size;
 	unsigned long nr_accounted = 0;
 
-	might_sleep();
-
 	if (is_vm_hugetlb_page(vma)) {
 		zap_hugepage_range(vma, address, size);
 		return;
@@ -1133,8 +1134,7 @@ static void unmap_mapping_range_list(str
 			zea = vea;
 		zap_page_range(vma,
 			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
-			(zea - zba + 1) << PAGE_SHIFT,
-			details->check_mapping? details: NULL);
+			(zea - zba + 1) << PAGE_SHIFT, details);
 	}
 }
 
@@ -1155,7 +1155,7 @@ static void unmap_mapping_range_list(str
  * but 0 when invalidating pagecache, don't throw away private data.
  */
 void unmap_mapping_range(struct address_space *mapping,
-	loff_t const holebegin, loff_t const holelen, int even_cows)
+		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
 	struct zap_details details;
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
@@ -1173,10 +1173,11 @@ void unmap_mapping_range(struct address_
 	details.nonlinear_vma = NULL;
 	details.first_index = hba;
 	details.last_index = hba + hlen - 1;
+	details.atomic = 1;	/* A spinlock is held */
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
-	down(&mapping->i_shared_sem);
+	spin_lock(&mapping->i_shared_lock);
 	/* Protect against page fault */
 	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
@@ -1187,7 +1188,7 @@ void unmap_mapping_range(struct address_
 
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
 		unmap_mapping_range_list(&mapping->i_mmap_shared, &details);
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
diff -puN mm/mmap.c~i_shared_lock mm/mmap.c
--- 25/mm/mmap.c~i_shared_lock	2004-05-04 20:54:36.457200792 -0700
+++ 25-akpm/mm/mmap.c	2004-05-04 20:54:36.475198056 -0700
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(sysctl_max_map_count);
 EXPORT_SYMBOL(vm_committed_space);
 
 /*
- * Requires inode->i_mapping->i_shared_sem
+ * Requires inode->i_mapping->i_shared_lock
  */
 static inline void
 __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
@@ -84,9 +84,9 @@ static void remove_shared_vm_struct(stru
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		down(&mapping->i_shared_sem);
+		spin_lock(&mapping->i_shared_lock);
 		__remove_shared_vm_struct(vma, file->f_dentry->d_inode);
-		up(&mapping->i_shared_sem);
+		spin_unlock(&mapping->i_shared_lock);
 	}
 }
 
@@ -286,12 +286,12 @@ static void vma_link(struct mm_struct *m
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping)
-		down(&mapping->i_shared_sem);
+		spin_lock(&mapping->i_shared_lock);
 	spin_lock(&mm->page_table_lock);
 	__vma_link(mm, vma, prev, rb_link, rb_parent);
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
-		up(&mapping->i_shared_sem);
+		spin_unlock(&mapping->i_shared_lock);
 
 	mark_mm_hugetlb(mm, vma);
 	mm->map_count++;
@@ -301,7 +301,7 @@ static void vma_link(struct mm_struct *m
 /*
  * Insert vm structure into process list sorted by address and into the inode's
  * i_mmap ring. The caller should hold mm->page_table_lock and
- * ->f_mappping->i_shared_sem if vm_file is non-NULL.
+ * ->f_mappping->i_shared_lock if vm_file is non-NULL.
  */
 static void
 __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
@@ -391,7 +391,7 @@ static struct vm_area_struct *vma_merge(
 {
 	spinlock_t *lock = &mm->page_table_lock;
 	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
-	struct semaphore *i_shared_sem;
+	spinlock_t *i_shared_lock;
 
 	/*
	 * We later require that vma->vm_flags == vm_flags, so this tests
@@ -400,7 +400,7 @@ static struct vm_area_struct *vma_merge(
 	if (vm_flags & VM_SPECIAL)
 		return NULL;
 
-	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
+	i_shared_lock = file ? &file->f_mapping->i_shared_lock : NULL;
 
 	if (!prev) {
 		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
@@ -417,7 +417,7 @@ static struct vm_area_struct *vma_merge(
 
 		if (unlikely(file && prev->vm_next &&
 				prev->vm_next->vm_file == file)) {
-			down(i_shared_sem);
+			spin_lock(i_shared_lock);
 			need_up = 1;
 		}
 		spin_lock(lock);
@@ -435,7 +435,7 @@ static struct vm_area_struct *vma_merge(
 			__remove_shared_vm_struct(next, inode);
 			spin_unlock(lock);
 			if (need_up)
-				up(i_shared_sem);
+				spin_unlock(i_shared_lock);
 			if (file)
 				fput(file);
 
@@ -445,7 +445,7 @@ static struct vm_area_struct *vma_merge(
 		}
 		spin_unlock(lock);
 		if (need_up)
-			up(i_shared_sem);
+			spin_unlock(i_shared_lock);
 		return prev;
 	}
 
@@ -460,13 +460,13 @@ static struct vm_area_struct *vma_merge(
 			return NULL;
 		if (end == prev->vm_start) {
 			if (file)
-				down(i_shared_sem);
+				spin_lock(i_shared_lock);
 			spin_lock(lock);
 			prev->vm_start = addr;
 			prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
 			spin_unlock(lock);
 			if (file)
-				up(i_shared_sem);
+				spin_unlock(i_shared_lock);
 			return prev;
 		}
 	}
@@ -1232,7 +1232,7 @@ int split_vma(struct mm_struct * mm, str
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping)
-		down(&mapping->i_shared_sem);
+		spin_lock(&mapping->i_shared_lock);
 	spin_lock(&mm->page_table_lock);
 
 	if (new_below) {
@@ -1245,7 +1245,7 @@ int split_vma(struct mm_struct * mm, str
 
 	spin_unlock(&mm->page_table_lock);
 	if (mapping)
-		up(&mapping->i_shared_sem);
+		spin_unlock(&mapping->i_shared_lock);
 
 	return 0;
 }
@@ -1479,7 +1479,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap ring.  If vm_file is non-NULL
- * then i_shared_sem is taken here.
+ * then i_shared_lock is taken here.
  */
 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
diff -puN mm/mremap.c~i_shared_lock mm/mremap.c
--- 25/mm/mremap.c~i_shared_lock	2004-05-04 20:54:36.459200488 -0700
+++ 25-akpm/mm/mremap.c	2004-05-04 20:54:36.476197904 -0700
@@ -180,7 +180,6 @@ static unsigned long move_vma(struct vm_
 		unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct address_space *mapping = NULL;
 	struct vm_area_struct *new_vma;
 	unsigned long vm_flags = vma->vm_flags;
 	unsigned long new_pgoff;
@@ -201,16 +200,6 @@ static unsigned long move_vma(struct vm_
 	if (!new_vma)
 		return -ENOMEM;
 
-	if (vma->vm_file) {
-		/*
-		 * Subtle point from Rajesh Venkatasubramanian: before
-		 * moving file-based ptes, we must lock vmtruncate out,
-		 * since it might clean the dst vma before the src vma,
-		 * and we propagate stale pages into the dst afterward.
-		 */
-		mapping = vma->vm_file->f_mapping;
-		down(&mapping->i_shared_sem);
-	}
 	moved_len = move_page_tables(vma, new_addr, old_addr, old_len, &cows);
 	if (moved_len < old_len) {
 		/*
@@ -227,8 +216,6 @@ static unsigned long move_vma(struct vm_
 	if (cows)	/* Downgrade or remove this message later */
 		printk(KERN_WARNING "%s: mremap moved %d cows\n",
 						current->comm, cows);
-	if (mapping)
-		up(&mapping->i_shared_sem);
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT) {
diff -puN mm/rmap.c~i_shared_lock mm/rmap.c
--- 25/mm/rmap.c~i_shared_lock	2004-05-04 20:54:36.460200336 -0700
+++ 25-akpm/mm/rmap.c	2004-05-04 20:54:36.477197752 -0700
@@ -300,7 +300,7 @@ migrate:
 *
 * This function is only called from page_referenced for object-based pages.
 *
- * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * The semaphore address_space->i_shared_lock is tried.  If it can't be gotten,
 * assume a reference count of 0, so try_to_unmap will then have a go.
 */
static inline int page_referenced_file(struct page *page)
@@ -313,7 +313,7 @@ static inline int page_referenced_file(s
 	int referenced = 0;
 	int failed = 0;
 
-	if (down_trylock(&mapping->i_shared_sem))
+	if (!spin_trylock(&mapping->i_shared_lock))
 		return 0;
 
 	list_for_each_entry(vma, &mapping->i_mmap, shared) {
@@ -355,7 +355,7 @@ static inline int page_referenced_file(s
 
 	WARN_ON(!failed);
 out:
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 	return referenced;
 }
 
@@ -710,7 +710,7 @@ out:
 *
 * This function is only called from try_to_unmap for object-based pages.
 *
- * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * The semaphore address_space->i_shared_lock is tried.  If it can't be gotten,
 * return a temporary error.
 */
static inline int try_to_unmap_file(struct page *page)
@@ -725,7 +725,7 @@ static inline int try_to_unmap_file(stru
 	unsigned long max_nl_cursor = 0;
 	unsigned long max_nl_size = 0;
 
-	if (down_trylock(&mapping->i_shared_sem))
+	if (!spin_trylock(&mapping->i_shared_lock))
 		return ret;
 
 	list_for_each_entry(vma, &mapping->i_mmap, shared) {
@@ -824,7 +824,7 @@ static inline int try_to_unmap_file(stru
 relock:
 	rmap_lock(page);
 out:
-	up(&mapping->i_shared_sem);
+	spin_unlock(&mapping->i_shared_lock);
 	return ret;
 }
 
diff -puN Documentation/vm/locking~i_shared_lock Documentation/vm/locking
--- 25/Documentation/vm/locking~i_shared_lock	2004-05-04 20:54:36.461200184 -0700
+++ 25-akpm/Documentation/vm/locking	2004-05-04 20:54:36.477197752 -0700
@@ -66,7 +66,7 @@ in some cases it is not really needed. E
 expand_stack(), it is hard to come up with a destructive scenario without
 having the vmlist protection in this case.
 
-The page_table_lock nests with the inode i_shared_sem and the kmem cache
+The page_table_lock nests with the inode i_shared_lock and the kmem cache
 c_spinlock spinlocks.  This is okay, since the kmem code asks for pages after
 dropping c_spinlock.  The page_table_lock also nests with pagecache_lock and
 pagemap_lru_lock spinlocks, and no code asks for memory with these locks
diff -puN include/linux/rmap.h~i_shared_lock include/linux/rmap.h
--- 25/include/linux/rmap.h~i_shared_lock	2004-05-04 20:54:36.462200032 -0700
+++ 25-akpm/include/linux/rmap.h	2004-05-04 20:54:36.478197600 -0700
@@ -69,15 +69,9 @@ static inline int mremap_moved_anon_rmap
 static inline int make_page_exclusive(struct vm_area_struct *vma,
 					unsigned long addr)
 {
-	switch (handle_mm_fault(vma->vm_mm, vma, addr, 1)) {
-	case VM_FAULT_MINOR:
-	case VM_FAULT_MAJOR:
+	if (handle_mm_fault(vma->vm_mm, vma, addr, 1) != VM_FAULT_OOM)
 		return 0;
-	case VM_FAULT_OOM:
-		return -ENOMEM;
-	default:
-		return -EFAULT;
-	}
+	return -ENOMEM;
 }
 
 /*
_