From: Hugh Dickins

The pte_chains rmap used pte_chain_lock (bit_spin_lock on PG_chainlock) to
lock its pte_chains.  We kept this (as page_map_lock: bit_spin_lock on
PG_maplock) when we moved to objrmap.  But the file objrmap locks its vma
tree with mapping->i_mmap_lock, and the anon objrmap locks its vma list
with anon_vma->lock: so isn't the page_map_lock superfluous?

Pretty much, yes.  The mapcount was protected by it, and needs to become
an atomic: starting at -1 like page->_count, so nr_mapped can be tracked
precisely up and down.  The last page_remove_rmap can't clear the anon
page mapping any more, because of races with page_add_rmap; and for the
same reason some BUG_ONs must go, but they've served their purpose.

vmscan decisions are naturally racy: little change there beyond removing
page_map_lock/unlock.  But to stabilize the file-backed page->mapping
against truncation while acquiring i_mmap_lock, page_referenced_file now
needs the page lock to be held even for refill_inactive_zone.  There's a
similar issue in acquiring anon_vma->lock, where the page lock doesn't
help: this patch pretends to handle it, but actually it needs the next.

Roughly 10% was cut off lmbench fork numbers on my 2*HT*P4.  Must confess
my testing failed to show the races even while they were knowingly
exposed: would benefit from testing on racier equipment.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
---

 25-akpm/include/linux/mm.h         |   22 ++++--
 25-akpm/include/linux/page-flags.h |    3 
 25-akpm/include/linux/rmap.h       |   13 ---
 25-akpm/mm/page_alloc.c            |   10 +-
 25-akpm/mm/rmap.c                  |  135 ++++++++++++++++++++-----------------
 25-akpm/mm/vmscan.c                |   37 +---------
 6 files changed, 110 insertions(+), 110 deletions(-)

diff -puN include/linux/mm.h~rmaplock-2-5-kill-page_map_lock include/linux/mm.h
--- 25/include/linux/mm.h~rmaplock-2-5-kill-page_map_lock	2004-08-09 22:02:28.961226272 -0700
+++ 25-akpm/include/linux/mm.h	2004-08-09 22:02:28.972224600 -0700
@@ -197,10 +197,9 @@ struct page {
 	page_flags_t flags;		/* Atomic flags, some possibly
 					 * updated asynchronously */
 	atomic_t _count;		/* Usage count, see below. */
-	unsigned int mapcount;		/* Count of ptes mapped in mms,
+	atomic_t _mapcount;		/* Count of ptes mapped in mms,
 					 * to show when page is mapped
-					 * & limit reverse map searches,
-					 * protected by PG_maplock.
+					 * & limit reverse map searches.
 					 */
 	unsigned long private;		/* Mapping-private opaque data:
 					 * usually used for buffer_heads
@@ -474,11 +473,26 @@ static inline pgoff_t page_index(struct
 }
 
 /*
+ * The atomic page->_mapcount, like _count, starts from -1:
+ * so that transitions both from it and to it can be tracked,
+ * using atomic_inc_and_test and atomic_add_negative(-1).
+ */
+static inline void reset_page_mapcount(struct page *page)
+{
+	atomic_set(&(page)->_mapcount, -1);
+}
+
+static inline int page_mapcount(struct page *page)
+{
+	return atomic_read(&(page)->_mapcount) + 1;
+}
+
+/*
  * Return true if this page is mapped into pagetables.
  */
 static inline int page_mapped(struct page *page)
 {
-	return page->mapcount != 0;
+	return atomic_read(&(page)->_mapcount) >= 0;
 }
 
 /*
diff -puN include/linux/page-flags.h~rmaplock-2-5-kill-page_map_lock include/linux/page-flags.h
--- 25/include/linux/page-flags.h~rmaplock-2-5-kill-page_map_lock	2004-08-09 22:02:28.962226120 -0700
+++ 25-akpm/include/linux/page-flags.h	2004-08-09 22:02:28.973224448 -0700
@@ -69,12 +69,11 @@
 #define PG_private		12	/* Has something at ->private */
 #define PG_writeback		13	/* Page is under writeback */
 #define PG_nosave		14	/* Used for system suspend/resume */
-#define PG_maplock		15	/* Lock bit for rmap to ptes */
+#define PG_compound		15	/* Part of a compound page */
 
 #define PG_swapcache		16	/* Swap page: swp_entry_t in private */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
-#define PG_compound		19	/* Part of a compound page */
 
 
 /*
diff -puN include/linux/rmap.h~rmaplock-2-5-kill-page_map_lock include/linux/rmap.h
--- 25/include/linux/rmap.h~rmaplock-2-5-kill-page_map_lock	2004-08-09 22:02:28.964225816 -0700
+++ 25-akpm/include/linux/rmap.h	2004-08-09 22:02:28.973224448 -0700
@@ -9,11 +9,6 @@
 #include
 #include
 
-#define page_map_lock(page) \
-	bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags)
-#define page_map_unlock(page) \
-	bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags)
-
 /*
  * The anon_vma heads a list of private "related" vmas, to scan if
  * an anonymous page pointing to this anon_vma needs to be unmapped:
@@ -87,15 +82,13 @@ void page_remove_rmap(struct page *);
  */
 static inline void page_dup_rmap(struct page *page)
 {
-	page_map_lock(page);
-	page->mapcount++;
-	page_map_unlock(page);
+	atomic_inc(&page->_mapcount);
 }
 
 /*
  * Called from mm/vmscan.c to handle paging out
  */
-int page_referenced(struct page *);
+int page_referenced(struct page *, int is_locked);
 int try_to_unmap(struct page *);
 
 #else	/* !CONFIG_MMU */
@@ -104,7 +97,7 @@ int try_to_unmap(struct page *);
 #define anon_vma_prepare(vma)	(0)
 #define anon_vma_link(vma)	do {} while (0)
 
-#define page_referenced(page) TestClearPageReferenced(page)
+#define page_referenced(page,l) TestClearPageReferenced(page)
 #define try_to_unmap(page) SWAP_FAIL
 
 #endif	/* CONFIG_MMU */
diff -puN mm/page_alloc.c~rmaplock-2-5-kill-page_map_lock mm/page_alloc.c
--- 25/mm/page_alloc.c~rmaplock-2-5-kill-page_map_lock	2004-08-09 22:02:28.965225664 -0700
+++ 25-akpm/mm/page_alloc.c	2004-08-09 22:02:28.975224144 -0700
@@ -80,7 +80,7 @@ static void bad_page(const char *functio
 		function, current->comm, page);
 	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
 		(unsigned long)page->flags, page->mapping,
-		(int)page->mapcount, page_count(page));
+		page_mapcount(page), page_count(page));
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
@@ -89,12 +89,11 @@ static void bad_page(const char *functio
 			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
-			1 << PG_maplock |
 			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
+	reset_page_mapcount(page);
 	page->mapping = NULL;
-	page->mapcount = 0;
 }
 
 #ifndef CONFIG_HUGETLB_PAGE
@@ -230,7 +229,6 @@ static inline void free_pages_check(cons
 			1 << PG_active	|
 			1 << PG_reclaim	|
 			1 << PG_slab	|
-			1 << PG_maplock |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(function, page);
@@ -352,7 +350,6 @@ static void prep_new_page(struct page *p
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
-			1 << PG_maplock |
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
@@ -514,6 +511,8 @@ static void fastcall free_hot_cold_page(
 	kernel_map_pages(page, 1, 0);
 
 	inc_page_state(pgfree);
+	if (PageAnon(page))
+		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
@@ -1390,6 +1389,7 @@ void __init memmap_init_zone(struct page
 	for (page = start; page < (start + size); page++) {
 		set_page_zone(page, NODEZONE(nid, zone));
 		set_page_count(page, 0);
+		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
 #ifdef WANT_PAGE_VIRTUAL
diff -puN mm/rmap.c~rmaplock-2-5-kill-page_map_lock mm/rmap.c
--- 25/mm/rmap.c~rmaplock-2-5-kill-page_map_lock	2004-08-09 22:02:28.967225360 -0700
+++ 25-akpm/mm/rmap.c	2004-08-09 22:02:28.977223840 -0700
@@ -163,13 +163,6 @@ void __init anon_vma_init(void)
 			sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
 }
 
-/* this needs the page->flags PG_maplock held */
-static inline void clear_page_anon(struct page *page)
-{
-	BUG_ON(!page->mapping);
-	page->mapping = NULL;
-}
-
 /*
  * At what user virtual address is page expected in vma?
  */
@@ -245,15 +238,22 @@ out:
 	return referenced;
 }
 
-static inline int page_referenced_anon(struct page *page)
+static int page_referenced_anon(struct page *page)
 {
-	unsigned int mapcount = page->mapcount;
+	unsigned int mapcount;
 	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
 	struct vm_area_struct *vma;
 	int referenced = 0;
 
+	/*
+	 * Recheck mapcount: it is not safe to take anon_vma->lock after
+	 * last page_remove_rmap, since struct anon_vma might be reused.
+	 */
+	mapcount = page_mapcount(page);
+	if (!mapcount)
+		return referenced;
+
 	spin_lock(&anon_vma->lock);
-	BUG_ON(list_empty(&anon_vma->head));
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
@@ -277,18 +277,39 @@ static inline int page_referenced_anon(s
 * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 * assume a reference count of 0, so try_to_unmap will then have a go.
 */
-static inline int page_referenced_file(struct page *page)
+static int page_referenced_file(struct page *page)
 {
-	unsigned int mapcount = page->mapcount;
+	unsigned int mapcount;
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 	int referenced = 0;
 
+	/*
+	 * The caller's checks on page->mapping and !PageAnon have made
+	 * sure that this is a file page: the check for page->mapping
+	 * excludes the case just before it gets set on an anon page.
+	 */
+	BUG_ON(PageAnon(page));
+
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_lock.
+	 */
+	BUG_ON(!PageLocked(page));
+
 	if (!spin_trylock(&mapping->i_mmap_lock))
 		return 0;
 
+	/*
+	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * is more likely to be accurate if we note it after spinning.
+	 */
+	mapcount = page_mapcount(page);
+
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
 				  == (VM_LOCKED|VM_MAYSHARE)) {
@@ -307,12 +328,12 @@ static inline int page_referenced_file(s
 /**
  * page_referenced - test if the page was referenced
  * @page: the page to test
+ * @is_locked: caller holds lock on the page
  *
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of ptes which referenced the page.
- * Caller needs to hold the rmap lock.
 */
-int page_referenced(struct page *page)
+int page_referenced(struct page *page, int is_locked)
 {
 	int referenced = 0;
 
@@ -322,11 +343,17 @@ int page_referenced(struct page *page)
 	if (TestClearPageReferenced(page))
 		referenced++;
 
-	if (page->mapcount && page->mapping) {
+	if (page_mapped(page) && page->mapping) {
 		if (PageAnon(page))
 			referenced += page_referenced_anon(page);
-		else
+		else if (is_locked)
+			referenced += page_referenced_file(page);
+		else if (TestSetPageLocked(page))
+			referenced++;
+		else if (page->mapping) {
 			referenced += page_referenced_file(page);
+			unlock_page(page);
+		}
 	}
 	return referenced;
 }
@@ -353,18 +380,12 @@ void page_add_anon_rmap(struct page *pag
 	index += vma->vm_pgoff;
 	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
 
-	page_map_lock(page);
-	if (!page->mapcount) {
-		BUG_ON(page->mapping);
+	if (atomic_inc_and_test(&page->_mapcount)) {
 		page->index = index;
 		page->mapping = (struct address_space *) anon_vma;
 		inc_page_state(nr_mapped);
-	} else {
-		BUG_ON(page->index != index);
-		BUG_ON(page->mapping != (struct address_space *) anon_vma);
 	}
-	page->mapcount++;
-	page_map_unlock(page);
+	/* else checking page index and mapping is racy */
 }
 
 /**
@@ -379,11 +400,8 @@ void page_add_file_rmap(struct page *pag
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 		return;
 
-	page_map_lock(page);
-	if (!page->mapcount)
+	if (atomic_inc_and_test(&page->_mapcount))
 		inc_page_state(nr_mapped);
-	page->mapcount++;
-	page_map_unlock(page);
 }
 
 /**
@@ -395,18 +413,20 @@ void page_add_file_rmap(struct page *pag
 void page_remove_rmap(struct page *page)
 {
 	BUG_ON(PageReserved(page));
-	BUG_ON(!page->mapcount);
 
-	page_map_lock(page);
-	page->mapcount--;
-	if (!page->mapcount) {
+	if (atomic_add_negative(-1, &page->_mapcount)) {
+		BUG_ON(page_mapcount(page) < 0);
+		/*
+		 * It would be tidy to reset the PageAnon mapping here,
+		 * but that might overwrite a racing page_add_anon_rmap
+		 * which increments mapcount after us but sets mapping
+		 * before us: so leave the reset to free_hot_cold_page,
+		 * and remember that it's only reliable while mapped.
+		 */
 		if (page_test_and_clear_dirty(page))
 			set_page_dirty(page);
-		if (PageAnon(page))
-			clear_page_anon(page);
 		dec_page_state(nr_mapped);
 	}
-	page_map_unlock(page);
 }
 
 /*
@@ -478,7 +498,7 @@ static int try_to_unmap_one(struct page
 	 * ptes from being unmapped, so swapoff can make progress.
 	 */
 	if (PageSwapCache(page) &&
-			page_count(page) != page->mapcount + 2) {
+			page_count(page) != page_mapcount(page) + 2) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -504,8 +524,7 @@ static int try_to_unmap_one(struct page
 	}
 
 	mm->rss--;
-	BUG_ON(!page->mapcount);
-	page->mapcount--;
+	page_remove_rmap(page);
 	page_cache_release(page);
 
 out_unmap:
@@ -616,17 +635,23 @@ out_unlock:
 	return SWAP_AGAIN;
 }
 
-static inline int try_to_unmap_anon(struct page *page)
+static int try_to_unmap_anon(struct page *page)
 {
 	struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
+	/*
+	 * Recheck mapped: it is not safe to take anon_vma->lock after
+	 * last page_remove_rmap, since struct anon_vma might be reused.
+	 */
+	if (!page_mapped(page))
+		return ret;
+
 	spin_lock(&anon_vma->lock);
-	BUG_ON(list_empty(&anon_vma->head));
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		ret = try_to_unmap_one(page, vma);
-		if (ret == SWAP_FAIL || !page->mapcount)
+		if (ret == SWAP_FAIL || !page_mapped(page))
 			break;
 	}
 	spin_unlock(&anon_vma->lock);
@@ -645,7 +670,7 @@ static inline int try_to_unmap_anon(stru
 * The spinlock address_space->i_mmap_lock is tried.  If it can't be gotten,
 * return a temporary error.
 */
-static inline int try_to_unmap_file(struct page *page)
+static int try_to_unmap_file(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -662,7 +687,7 @@ static inline int try_to_unmap_file(stru
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma);
-		if (ret == SWAP_FAIL || !page->mapcount)
+		if (ret == SWAP_FAIL || !page_mapped(page))
 			goto out;
 	}
 
@@ -691,8 +716,9 @@ static inline int try_to_unmap_file(stru
 	 * The mapcount of the page we came in with is irrelevant,
 	 * but even so use it as a guide to how hard we should try?
 	 */
-	mapcount = page->mapcount;
-	page_map_unlock(page);
+	mapcount = page_mapcount(page);
+	if (!mapcount)
+		goto out;
 	cond_resched_lock(&mapping->i_mmap_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -715,7 +741,7 @@ static inline int try_to_unmap_file(stru
 			cursor += CLUSTER_SIZE;
 			vma->vm_private_data = (void *) cursor;
 			if ((int)mapcount <= 0)
-				goto relock;
+				goto out;
 		}
 		if (ret != SWAP_FAIL)
 			vma->vm_private_data =
@@ -736,8 +762,6 @@ static inline int try_to_unmap_file(stru
 		if (!(vma->vm_flags & VM_RESERVED))
 			vma->vm_private_data = NULL;
 	}
-relock:
-	page_map_lock(page);
 out:
 	spin_unlock(&mapping->i_mmap_lock);
 	return ret;
@@ -748,11 +772,11 @@ out:
  * @page: the page to get unmapped
  *
  * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path.  Caller must hold the page lock
- * and its rmap lock.  Return values are:
+ * page, used in the pageout path.  Caller must hold the page lock.
+ * Return values are:
  *
  * SWAP_SUCCESS	- we succeeded in removing all mappings
- * SWAP_AGAIN	- we missed a trylock, try again later
+ * SWAP_AGAIN	- we missed a mapping, try again later
  * SWAP_FAIL	- the page is unswappable
  */
 int try_to_unmap(struct page *page)
@@ -761,20 +785,13 @@ int try_to_unmap(struct page *page)
 
 	BUG_ON(PageReserved(page));
 	BUG_ON(!PageLocked(page));
-	BUG_ON(!page->mapcount);
 
 	if (PageAnon(page))
 		ret = try_to_unmap_anon(page);
 	else
 		ret = try_to_unmap_file(page);
 
-	if (!page->mapcount) {
-		if (page_test_and_clear_dirty(page))
-			set_page_dirty(page);
-		if (PageAnon(page))
-			clear_page_anon(page);
-		dec_page_state(nr_mapped);
+	if (!page_mapped(page))
 		ret = SWAP_SUCCESS;
-	}
 	return ret;
 }
diff -puN mm/vmscan.c~rmaplock-2-5-kill-page_map_lock mm/vmscan.c
--- 25/mm/vmscan.c~rmaplock-2-5-kill-page_map_lock	2004-08-09 22:02:28.968225208 -0700
+++ 25-akpm/mm/vmscan.c	2004-08-09 22:02:28.979223536 -0700
@@ -219,7 +219,7 @@ static int shrink_slab(unsigned long sca
 	return 0;
 }
 
-/* Must be called with page's rmap lock held. */
+/* Called without lock on whether page is mapped, so answer is unstable */
 static inline int page_mapping_inuse(struct page *page)
 {
 	struct address_space *mapping;
@@ -377,26 +377,19 @@ static int shrink_list(struct list_head
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
 
-		page_map_lock(page);
-		referenced = page_referenced(page);
-		if (referenced && page_mapping_inuse(page)) {
-			/* In active use or really unfreeable. Activate it. */
-			page_map_unlock(page);
+		referenced = page_referenced(page, 1);
+		/* In active use or really unfreeable?  Activate it. */
+		if (referenced && page_mapping_inuse(page))
 			goto activate_locked;
-		}
 
 #ifdef CONFIG_SWAP
 		/*
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
-		 *
-		 * XXX: implement swap clustering ?
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			page_map_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
-			page_map_lock(page);
 		}
 #endif /* CONFIG_SWAP */
 
@@ -411,16 +404,13 @@ static int shrink_list(struct list_head
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				page_map_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				page_map_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		page_map_unlock(page);
 
 		if (PageDirty(page)) {
 			if (referenced)
@@ -723,25 +713,12 @@ refill_inactive_zone(struct zone *zone,
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			if (!reclaim_mapped) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
-			page_map_lock(page);
-			if (page_referenced(page)) {
-				page_map_unlock(page);
+			if (!reclaim_mapped ||
+			    (total_swap_pages == 0 && PageAnon(page)) ||
+			    page_referenced(page, 0)) {
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			page_map_unlock(page);
-		}
-		/*
-		 * FIXME: need to consider page_count(page) here if/when we
-		 * reap orphaned pages via the LRU (Daniel's locking stuff)
-		 */
-		if (total_swap_pages == 0 && PageAnon(page)) {
-			list_add(&page->lru, &l_active);
-			continue;
 		}
 		list_add(&page->lru, &l_inactive);
 	}
_
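
Appendix, not part of the patch: a minimal standalone sketch of the -1-based
_mapcount convention described in the changelog, assuming C11 <stdatomic.h>
as a stand-in for the kernel's atomic_t. The struct page and nr_mapped below
are reduced illustrations, not the kernel's definitions; the point is only
that an atomic_inc_and_test-style increment fires on the -1 to 0 transition
(first mapping) and an atomic_add_negative(-1)-style decrement fires on the
0 to -1 transition (last mapping gone), which is what the nr_mapped
accounting keys off.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Reduced stand-in for struct page: only the new _mapcount field. */
struct page {
	atomic_int _mapcount;
};

/* As in the patch: _mapcount starts at -1, like page->_count. */
static void reset_page_mapcount(struct page *page)
{
	atomic_store(&page->_mapcount, -1);
}

static bool page_mapped(struct page *page)
{
	return atomic_load(&page->_mapcount) >= 0;
}

/* Models atomic_inc_and_test(): true when the increment lands on 0,
 * i.e. this was the first pte to map the page. */
static bool add_rmap_was_first(struct page *page)
{
	return atomic_fetch_add(&page->_mapcount, 1) + 1 == 0;
}

/* Models atomic_add_negative(-1, ...): true when the result drops below 0,
 * i.e. this was the last pte mapping the page. */
static bool remove_rmap_was_last(struct page *page)
{
	return atomic_fetch_sub(&page->_mapcount, 1) - 1 < 0;
}

int main(void)
{
	struct page page;
	int nr_mapped = 0;	/* stand-in for the global page state counter */

	reset_page_mapcount(&page);
	assert(!page_mapped(&page));

	if (add_rmap_was_first(&page))		/* -1 -> 0 */
		nr_mapped++;
	assert(page_mapped(&page));

	if (remove_rmap_was_last(&page))	/* 0 -> -1 */
		nr_mapped--;
	assert(!page_mapped(&page));

	printf("nr_mapped is back to %d\n", nr_mapped);
	return 0;
}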