From: Hugh Dickins

With page_map_lock out of the way, there's no need for page_referenced
and try_to_unmap to use trylocks - provided we switch anon_vma->lock
and mm->page_table_lock around in anon_vma_prepare.  Though I suppose
it's possible that we'll find that vmscan makes better progress with
trylocks than spinning - we're free to choose trylocks again if so.

Try to update the mm lock ordering documentation in filemap.c.  But I
still find it confusing, and I've no idea of where to stop.  So add an
mm lock ordering list I can understand to rmap.c.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
---

 25-akpm/mm/filemap.c |   15 +++++----
 25-akpm/mm/rmap.c    |   82 +++++++++++++++++++++++++++------------------------
 2 files changed, 54 insertions(+), 43 deletions(-)

diff -puN mm/filemap.c~rmaplock-4-5-mm-lock-ordering mm/filemap.c
--- 25/mm/filemap.c~rmaplock-4-5-mm-lock-ordering   Thu Aug 5 15:42:46 2004
+++ 25-akpm/mm/filemap.c                            Thu Aug 5 15:42:46 2004
@@ -60,7 +60,6 @@
  *      ->swap_list_lock
  *        ->swap_device_lock      (exclusive_swap_page, others)
  *          ->mapping->tree_lock
- *            ->page_map_lock()   (try_to_unmap_file)
  *
  *  ->i_sem
  *    ->i_mmap_lock               (truncate->unmap_mapping_range)
@@ -83,16 +82,20 @@
  *    ->sb_lock                   (fs/fs-writeback.c)
  *    ->mapping->tree_lock        (__sync_single_inode)
  *
+ *  ->i_mmap_lock
+ *    ->anon_vma.lock             (vma_adjust)
+ *
+ *  ->anon_vma.lock
+ *    ->page_table_lock           (anon_vma_prepare and various)
+ *
  *  ->page_table_lock
  *    ->swap_device_lock          (try_to_unmap_one)
  *    ->private_lock              (try_to_unmap_one)
  *    ->tree_lock                 (try_to_unmap_one)
  *    ->zone.lru_lock             (follow_page->mark_page_accessed)
- *    ->page_map_lock()           (page_add_anon_rmap)
- *      ->tree_lock               (page_remove_rmap->set_page_dirty)
- *      ->private_lock            (page_remove_rmap->set_page_dirty)
- *      ->inode_lock              (page_remove_rmap->set_page_dirty)
- *    ->anon_vma.lock             (anon_vma_prepare)
+ *    ->private_lock              (page_remove_rmap->set_page_dirty)
+ *    ->tree_lock                 (page_remove_rmap->set_page_dirty)
+ *    ->inode_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode_lock                (zap_pte_range->set_page_dirty)
  *    ->private_lock              (zap_pte_range->__set_page_dirty_buffers)
  *
diff -puN mm/rmap.c~rmaplock-4-5-mm-lock-ordering mm/rmap.c
--- 25/mm/rmap.c~rmaplock-4-5-mm-lock-ordering      Thu Aug 5 15:42:46 2004
+++ 25-akpm/mm/rmap.c                               Thu Aug 5 15:42:46 2004
@@ -18,9 +18,30 @@
  */
 
 /*
- * Locking: see "Lock ordering" summary in filemap.c.
- * In swapout, page_map_lock is held on entry to page_referenced and
- * try_to_unmap, so they trylock for i_mmap_lock and page_table_lock.
+ * Lock ordering in mm:
+ *
+ * inode->i_sem       (while writing or truncating, not reading or faulting)
+ *   inode->i_alloc_sem
+ *
+ * When a page fault occurs in writing from user to file, down_read
+ * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within
+ * down_read of mmap_sem; i_sem and down_write of mmap_sem are never
+ * taken together; in truncation, i_sem is taken outermost.
+ *
+ * mm->mmap_sem
+ *   page->flags PG_locked (lock_page)
+ *     mapping->i_mmap_lock
+ *       anon_vma->lock
+ *         mm->page_table_lock
+ *           zone->lru_lock (in mark_page_accessed)
+ *           swap_list_lock (in swap_free etc's swap_info_get)
+ *             swap_device_lock (in swap_duplicate, swap_info_get)
+ *           mapping->private_lock (in __set_page_dirty_buffers)
+ *           inode_lock (in set_page_dirty's __mark_inode_dirty)
+ *             sb_lock (within inode_lock in fs/fs-writeback.c)
+ *             mapping->tree_lock (widely used, in set_page_dirty,
+ *                       in arch-dependent flush_dcache_mmap_lock,
+ *                       within inode_lock in __sync_single_inode)
  */
 
 #include <linux/mm.h>
@@ -64,28 +85,32 @@ int anon_vma_prepare(struct vm_area_stru
         might_sleep();
         if (unlikely(!anon_vma)) {
                 struct mm_struct *mm = vma->vm_mm;
-                struct anon_vma *allocated = NULL;
+                struct anon_vma *allocated, *locked;
 
                 anon_vma = find_mergeable_anon_vma(vma);
-                if (!anon_vma) {
+                if (anon_vma) {
+                        allocated = NULL;
+                        locked = anon_vma;
+                        spin_lock(&locked->lock);
+                } else {
                         anon_vma = anon_vma_alloc();
                         if (unlikely(!anon_vma))
                                 return -ENOMEM;
                         allocated = anon_vma;
+                        locked = NULL;
                 }
 
                 /* page_table_lock to protect against threads */
                 spin_lock(&mm->page_table_lock);
                 if (likely(!vma->anon_vma)) {
-                        if (!allocated)
-                                spin_lock(&anon_vma->lock);
                         vma->anon_vma = anon_vma;
                         list_add(&vma->anon_vma_node, &anon_vma->head);
-                        if (!allocated)
-                                spin_unlock(&anon_vma->lock);
                         allocated = NULL;
                 }
                 spin_unlock(&mm->page_table_lock);
+
+                if (locked)
+                        spin_unlock(&locked->lock);
                 if (unlikely(allocated))
                         anon_vma_free(allocated);
         }
@@ -225,8 +250,7 @@ static int page_referenced_one(struct pa
         if (address == -EFAULT)
                 goto out;
 
-        if (!spin_trylock(&mm->page_table_lock))
-                goto out;
+        spin_lock(&mm->page_table_lock);
 
         pgd = pgd_offset(mm, address);
         if (!pgd_present(*pgd))
@@ -293,9 +317,6 @@ static int page_referenced_anon(struct p
  * of references it found.
  *
  * This function is only called from page_referenced for object-based pages.
- *
- * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
- * assume a reference count of 0, so try_to_unmap will then have a go.
  */
 static int page_referenced_file(struct page *page)
 {
@@ -321,8 +342,7 @@ static int page_referenced_file(struct p
          */
         BUG_ON(!PageLocked(page));
 
-        if (!spin_trylock(&mapping->i_mmap_lock))
-                return 0;
+        spin_lock(&mapping->i_mmap_lock);
 
         /*
          * i_mmap_lock does not stabilize mapcount at all, but mapcount
@@ -473,8 +493,7 @@ static int try_to_unmap_one(struct page
          * We need the page_table_lock to protect us from page faults,
          * munmap, fork, etc...
          */
-        if (!spin_trylock(&mm->page_table_lock))
-                goto out;
+        spin_lock(&mm->page_table_lock);
 
         pgd = pgd_offset(mm, address);
         if (!pgd_present(*pgd))
@@ -577,7 +596,7 @@ out:
 #define CLUSTER_SIZE    min(32*PAGE_SIZE, PMD_SIZE)
 #define CLUSTER_MASK    (~(CLUSTER_SIZE - 1))
 
-static int try_to_unmap_cluster(unsigned long cursor,
+static void try_to_unmap_cluster(unsigned long cursor,
         unsigned int *mapcount, struct vm_area_struct *vma)
 {
         struct mm_struct *mm = vma->vm_mm;
@@ -594,8 +613,7 @@ static int try_to_unmap_cluster(unsigned
          * We need the page_table_lock to protect us from page faults,
          * munmap, fork, etc...
          */
-        if (!spin_trylock(&mm->page_table_lock))
-                return SWAP_FAIL;
+        spin_lock(&mm->page_table_lock);
 
         address = (vma->vm_start + cursor) & CLUSTER_MASK;
         end = address + CLUSTER_SIZE;
@@ -652,7 +670,6 @@ static int try_to_unmap_cluster(unsigned
 
 out_unlock:
         spin_unlock(&mm->page_table_lock);
-        return SWAP_AGAIN;
 }
 
 static int try_to_unmap_anon(struct page *page)
@@ -682,9 +699,6 @@ static int try_to_unmap_anon(struct page
  * contained in the address_space struct it points to.
  *
  * This function is only called from try_to_unmap for object-based pages.
- *
- * The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
- * return a temporary error.
  */
 static int try_to_unmap_file(struct page *page)
 {
@@ -698,9 +712,7 @@ static int try_to_unmap_file(struct page
         unsigned long max_nl_size = 0;
         unsigned int mapcount;
 
-        if (!spin_trylock(&mapping->i_mmap_lock))
-                return ret;
-
+        spin_lock(&mapping->i_mmap_lock);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                 ret = try_to_unmap_one(page, vma);
                 if (ret == SWAP_FAIL || !page_mapped(page))
@@ -722,8 +734,10 @@ static int try_to_unmap_file(struct page
                         max_nl_size = cursor;
         }
 
-        if (max_nl_size == 0)   /* any nonlinears locked or reserved */
+        if (max_nl_size == 0) { /* any nonlinears locked or reserved */
+                ret = SWAP_FAIL;
                 goto out;
+        }
 
         /*
          * We don't try to search for this page in the nonlinear vmas,
@@ -750,19 +764,13 @@ static int try_to_unmap_file(struct page
                 while (vma->vm_mm->rss &&
                                 cursor < max_nl_cursor &&
                                 cursor < vma->vm_end - vma->vm_start) {
-                        ret = try_to_unmap_cluster(
-                                        cursor, &mapcount, vma);
-                        if (ret == SWAP_FAIL)
-                                break;
+                        try_to_unmap_cluster(cursor, &mapcount, vma);
                         cursor += CLUSTER_SIZE;
                         vma->vm_private_data = (void *) cursor;
                         if ((int)mapcount <= 0)
                                 goto out;
                 }
-                if (ret != SWAP_FAIL)
-                        vma->vm_private_data =
-                                        (void *) max_nl_cursor;
-                ret = SWAP_AGAIN;
+                vma->vm_private_data = (void *) max_nl_cursor;
         }
         cond_resched_lock(&mapping->i_mmap_lock);
         max_nl_cursor += CLUSTER_SIZE;
_
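
[Editor's sketch] For readers who would rather see the end state than the hunks:
below is a sketch of anon_vma_prepare() as it reads with this patch applied.  It
is reassembled from the diff above; the function's opening declaration and the
trailing return 0 are not shown in the hunks and are filled in from my reading of
the surrounding 2.6.8-mm source, so treat it as an illustration of the new
ordering (anon_vma->lock taken before mm->page_table_lock, matching the rmap.c
list) rather than as the verbatim file.

int anon_vma_prepare(struct vm_area_struct *vma)
{
        struct anon_vma *anon_vma = vma->anon_vma;

        might_sleep();
        if (unlikely(!anon_vma)) {
                struct mm_struct *mm = vma->vm_mm;
                struct anon_vma *allocated, *locked;

                anon_vma = find_mergeable_anon_vma(vma);
                if (anon_vma) {
                        /* merging: take anon_vma->lock before page_table_lock */
                        allocated = NULL;
                        locked = anon_vma;
                        spin_lock(&locked->lock);
                } else {
                        /* fresh anon_vma, not yet visible to others: no lock needed */
                        anon_vma = anon_vma_alloc();
                        if (unlikely(!anon_vma))
                                return -ENOMEM;
                        allocated = anon_vma;
                        locked = NULL;
                }

                /* page_table_lock to protect against threads */
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
                        list_add(&vma->anon_vma_node, &anon_vma->head);
                        allocated = NULL;
                }
                spin_unlock(&mm->page_table_lock);

                if (locked)
                        spin_unlock(&locked->lock);
                if (unlikely(allocated))
                        anon_vma_free(allocated);
        }
        return 0;
}

With anon_vma->lock now nesting outside mm->page_table_lock everywhere, the
rmap paths in page_referenced and try_to_unmap can take i_mmap_lock,
anon_vma->lock and page_table_lock unconditionally, which is what allows the
trylocks above to become plain spin_locks.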