From Dave. Crappy name.

Partial object-based reverse mapping: ptes which map file-backed pages are
no longer tracked with pte_chains; they are found when needed by walking the
vmas on the i_mmap/i_mmap_shared lists of page->mapping.  Anonymous pages are
marked with the new PG_anon flag and keep using pte_chains, and nonlinear
(remap_file_pages) pages are converted to that representation by
page_convert_anon().

 fs/exec.c                    |    1 
 include/asm-i386/mman.h      |    0 
 include/asm-ppc64/mman.h     |    0 
 include/linux/mm.h           |    1 
 include/linux/page-flags.h   |    5 
 include/linux/rmap-locking.h |    0 
 include/linux/swap.h         |    2 
 mm/filemap.c                 |    3 
 mm/fremap.c                  |   22 +-
 mm/memory.c                  |    8 
 mm/mmap.c                    |   16 +
 mm/page_alloc.c              |    2 
 mm/rmap.c                    |  404 ++++++++++++++++++++++++++++++++++++++++++-
 mm/swapfile.c                |  116 +++++++-----
 14 files changed, 520 insertions(+), 60 deletions(-)

diff -puN fs/exec.c~objrmap fs/exec.c
--- 25/fs/exec.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/fs/exec.c	2003-04-08 03:16:49.000000000 -0700
@@ -315,6 +315,7 @@ void put_dirty_page(struct task_struct *
 	lru_cache_add_active(page);
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
+	SetPageAnon(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
 	pte_chain = page_add_rmap(page, pte, pte_chain);
 	pte_unmap(pte);
diff -puN include/asm-i386/mman.h~objrmap include/asm-i386/mman.h
diff -puN include/linux/mm.h~objrmap include/linux/mm.h
--- 25/include/linux/mm.h~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/include/linux/mm.h	2003-04-08 03:16:49.000000000 -0700
@@ -170,6 +170,7 @@ struct page {
 		struct pte_chain *chain;/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock */
 		pte_addr_t direct;
+		int mapcount;
 	} pte;
 	unsigned long private;		/* mapping-private opaque data */
diff -puN include/linux/page-flags.h~objrmap include/linux/page-flags.h
--- 25/include/linux/page-flags.h~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/include/linux/page-flags.h	2003-04-08 03:16:49.000000000 -0700
@@ -74,6 +74,7 @@
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
+#define PG_anon			20	/* Anonymous page */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -254,6 +255,10 @@ extern void get_full_page_state(struct p
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
+#define PageAnon(page)		test_bit(PG_anon, &(page)->flags)
+#define SetPageAnon(page)	set_bit(PG_anon, &(page)->flags)
+#define ClearPageAnon(page)	clear_bit(PG_anon, &(page)->flags)
+
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
  * but it may again do so one day.
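The struct page and page-flags changes above are the core of the design:
the new PG_anon bit says which member of the page->pte union is live.  A
minimal userspace sketch of that discrimination follows (the toy_ names and
simplified flag bits are illustrative stand-ins, not kernel code):

/*
 * Illustrative sketch (not part of the patch): how PG_anon selects the
 * meaning of page->pte after this change.
 */
#include <stdio.h>

#define TOY_PG_direct	(1u << 0)	/* pte.direct holds a single pte address */
#define TOY_PG_anon	(1u << 1)	/* anonymous page: pte.direct/pte.chain in use */

struct toy_page {
	unsigned int flags;
	union {
		void *chain;		/* anon, several mappings: pte_chain list */
		unsigned long direct;	/* anon, exactly one mapping: the pte itself */
		int mapcount;		/* file-backed: just a count; the ptes are
					 * recomputed from mapping->i_mmap[_shared] */
	} pte;
};

static const char *toy_describe(const struct toy_page *page)
{
	if (!(page->flags & TOY_PG_anon))
		return "file page: pte.mapcount, walk i_mmap/i_mmap_shared to find ptes";
	if (page->flags & TOY_PG_direct)
		return "anon page, one mapping: pte.direct";
	return "anon page, shared: pte.chain";
}

int main(void)
{
	struct toy_page file_page = { .flags = 0, .pte.mapcount = 3 };
	struct toy_page anon_page = { .flags = TOY_PG_anon | TOY_PG_direct };

	printf("%s\n", toy_describe(&file_page));
	printf("%s\n", toy_describe(&anon_page));
	return 0;
}

The point is that a file-backed page only needs a reference count; its ptes
are rediscovered by walking the vma lists, which is what the mm/rmap.c
changes below implement.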
diff -puN mm/fremap.c~objrmap mm/fremap.c
--- 25/mm/fremap.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/fremap.c	2003-04-08 03:16:49.000000000 -0700
@@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s
 	pgd_t *pgd;
 	pmd_t *pmd;
 	struct pte_chain *pte_chain;
+	unsigned long pgidx;
 
 	pte_chain = pte_chain_alloc(GFP_KERNEL);
 	if (!pte_chain)
 		goto err;
+
+	/*
+	 * Convert this page to anon for objrmap if it's nonlinear
+	 */
+	pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
+	pgidx += vma->vm_pgoff;
+	pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
+	if (!PageAnon(page) && (page->index != pgidx)) {
+		lock_page(page);
+		err = page_convert_anon(page);
+		unlock_page(page);
+		if (err < 0)
+			goto err_free;
+	}
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
@@ -86,12 +102,10 @@ int install_page(struct mm_struct *mm, s
 	if (flush)
 		flush_tlb_page(vma, addr);
 
-	spin_unlock(&mm->page_table_lock);
-	pte_chain_free(pte_chain);
-	return 0;
-
+	err = 0;
 err_unlock:
 	spin_unlock(&mm->page_table_lock);
+err_free:
 	pte_chain_free(pte_chain);
 err:
 	return err;
diff -puN mm/memory.c~objrmap mm/memory.c
--- 25/mm/memory.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/memory.c	2003-04-08 03:16:49.000000000 -0700
@@ -1012,6 +1012,7 @@ static int do_wp_page(struct mm_struct *
 		++mm->rss;
 	page_remove_rmap(old_page, page_table);
 	break_cow(vma, new_page, address, page_table);
+	SetPageAnon(new_page);
 	pte_chain = page_add_rmap(new_page, page_table, pte_chain);
 	lru_cache_add_active(new_page);
@@ -1221,6 +1222,7 @@ static int do_swap_page(struct mm_struct
 	flush_page_to_ram(page);
 	flush_icache_page(vma, page);
 	set_pte(page_table, pte);
+	SetPageAnon(page);
 	pte_chain = page_add_rmap(page, page_table, pte_chain);
 
 	/* No need to invalidate - it was non-present before */
@@ -1287,6 +1289,7 @@ do_anonymous_page(struct mm_struct *mm,
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add_active(page);
 		mark_page_accessed(page);
+		SetPageAnon(page);
 	}
 
 	set_pte(page_table, entry);
@@ -1346,6 +1349,10 @@ do_no_page(struct mm_struct *mm, struct
 	if (!pte_chain)
 		goto oom;
 
+	/* See if nopage returned an anon page */
+	if (!new_page->mapping || PageSwapCache(new_page))
+		SetPageAnon(new_page);
+
 	/*
 	 * Should we do an early C-O-W break?
 	 */
@@ -1358,6 +1365,7 @@ do_no_page(struct mm_struct *mm, struct
 		copy_user_highpage(page, new_page, address);
 		page_cache_release(new_page);
 		lru_cache_add_active(page);
+		SetPageAnon(page);
 		new_page = page;
 	}
diff -puN mm/mmap.c~objrmap mm/mmap.c
--- 25/mm/mmap.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/mmap.c	2003-04-08 03:16:49.000000000 -0700
@@ -311,14 +311,26 @@ static inline void __vma_link_file(struc
 	if (file) {
 		struct inode * inode = file->f_dentry->d_inode;
 		struct address_space *mapping = inode->i_mapping;
+		struct list_head *vmlist, *vmhead;
 
 		if (vma->vm_flags & VM_DENYWRITE)
 			atomic_dec(&inode->i_writecount);
 
 		if (vma->vm_flags & VM_SHARED)
-			list_add_tail(&vma->shared, &mapping->i_mmap_shared);
+			vmhead = &mapping->i_mmap_shared;
 		else
-			list_add_tail(&vma->shared, &mapping->i_mmap);
+			vmhead = &mapping->i_mmap;
+
+		list_for_each(vmlist, vmhead) {
+			struct vm_area_struct *vmtemp;
+			vmtemp = list_entry(vmlist, struct vm_area_struct, shared);
+			if (vmtemp->vm_pgoff >= vma->vm_pgoff)
+				break;
+		}
+		if (vmlist == vmhead)
+			list_add_tail(&vma->shared, vmlist);
+		else
+			list_add(&vma->shared, vmlist);
 	}
 }
diff -puN mm/page_alloc.c~objrmap mm/page_alloc.c
--- 25/mm/page_alloc.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-04-08 03:16:49.000000000 -0700
@@ -220,6 +220,8 @@ static inline void free_pages_check(cons
 		bad_page(function, page);
 	if (PageDirty(page))
 		ClearPageDirty(page);
+	if (PageAnon(page))
+		ClearPageAnon(page);
 }
 
 /*
diff -puN mm/rmap.c~objrmap mm/rmap.c
--- 25/mm/rmap.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/rmap.c	2003-04-08 03:16:49.000000000 -0700
@@ -14,8 +14,8 @@
 /*
  * Locking:
  * - the page->pte.chain is protected by the PG_chainlock bit,
- *   which nests within the zone->lru_lock, then the
- *   mm->page_table_lock, and then the page lock.
+ *   which nests within the mm->page_table_lock,
+ *   which nests within the page lock.
 * - because swapout locking is opposite to the locking order
 *   in the page fault path, the swapout path uses trylocks
 *   on the mm->page_table_lock
@@ -102,6 +102,140 @@ pte_chain_encode(struct pte_chain *pte_c
 **/
 
 /**
+ * find_pte - Find a pte pointer given a vma and a struct page.
+ * @vma: the vma to search
+ * @page: the page to find
+ *
+ * Determine if this page is mapped in this vma.  If it is, map and return
+ * the pte pointer associated with it.  Return null if the page is not
+ * mapped in this vma for any reason.
+ *
+ * This is strictly an internal helper function for the object-based rmap
+ * functions.
+ *
+ * It is the caller's responsibility to unmap the pte if it is returned.
+ */
+static inline pte_t *
+find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long loffset;
+	unsigned long address;
+
+	loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+	address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+	if (address < vma->vm_start || address >= vma->vm_end)
+		goto out;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pmd = pmd_offset(pgd, address);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	pte = pte_offset_map(pmd, address);
+	if (!pte_present(*pte))
+		goto out_unmap;
+
+	if (page_to_pfn(page) != pte_pfn(*pte))
+		goto out_unmap;
+
+	if (addr)
+		*addr = address;
+
+	return pte;
+
+out_unmap:
+	pte_unmap(pte);
+out:
+	return NULL;
+}
+
+/**
+ * page_referenced_obj_one - referenced check for object-based rmap
+ * @vma: the vma to look in.
+ * @page: the page we're working on.
+ *
+ * Find a pte entry for a page/vma pair, then check and clear the referenced
+ * bit.
+ *
+ * This is strictly a helper function for page_referenced_obj.
+ */
+static int
+page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t *pte;
+	int referenced = 0;
+
+	if (!spin_trylock(&mm->page_table_lock))
+		return 1;
+
+	pte = find_pte(vma, page, NULL);
+	if (pte) {
+		if (ptep_test_and_clear_young(pte))
+			referenced++;
+		pte_unmap(pte);
+	}
+
+	spin_unlock(&mm->page_table_lock);
+	return referenced;
+}
+
+/**
+ * page_referenced_obj - referenced check for object-based rmap
+ * @page: the page we're checking references on.
+ *
+ * For an object-based mapped page, find all the places it is mapped and
+ * check/clear the referenced flag.  This is done by following the page->mapping
+ * pointer, then walking the chain of vmas it holds.  It returns the number
+ * of references it found.
+ *
+ * This function is only called from page_referenced for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * assume a reference count of 1.
+ */
+static int
+page_referenced_obj(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	int referenced = 0;
+
+	if (!page->pte.mapcount)
+		return 0;
+
+	if (!mapping)
+		BUG();
+
+	if (PageSwapCache(page))
+		BUG();
+
+	if (down_trylock(&mapping->i_shared_sem))
+		return 1;
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)))
+			break;
+		referenced += page_referenced_obj_one(vma, page);
+	}
+
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)))
+			break;
+		referenced += page_referenced_obj_one(vma, page);
+	}
+
+	up(&mapping->i_shared_sem);
+
+	return referenced;
+}
+
+/**
  * page_referenced - test if the page was referenced
  * @page: the page to test
  *
@@ -120,6 +254,10 @@ int page_referenced(struct page * page)
 	if (TestClearPageReferenced(page))
 		referenced++;
 
+	if (!PageAnon(page)) {
+		referenced += page_referenced_obj(page);
+		goto out;
+	}
 	if (PageDirect(page)) {
 		pte_t *pte = rmap_ptep_map(page->pte.direct);
 		if (ptep_test_and_clear_young(pte))
@@ -153,6 +291,7 @@ int page_referenced(struct page * page)
 			__pte_chain_free(pc);
 		}
 	}
+out:
 	return referenced;
 }
 
@@ -175,6 +314,21 @@ page_add_rmap(struct page *page, pte_t *
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 		goto out;
 
+	/*
+	 * If this is an object-based page, just count it.  We can
+	 * find the mappings by walking the object vma chain for that object.
+	 */
+	if (!PageAnon(page)) {
+		if (!page->mapping)
+			BUG();
+		if (PageSwapCache(page))
+			BUG();
+		if (!page->pte.mapcount)
+			inc_page_state(nr_mapped);
+		page->pte.mapcount++;
+		goto out;
+	}
+
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
 		SetPageDirect(page);
@@ -231,8 +385,25 @@ void page_remove_rmap(struct page *page,
 		goto out_unlock;
 
 	if (!page_mapped(page))
-		goto out_unlock;	/* remap_page_range() from a driver? */
+		goto out_unlock;
 
+	/*
+	 * If this is an object-based page, just uncount it.  We can
+	 * find the mappings by walking the object vma chain for that object.
+	 */
+	if (!PageAnon(page)) {
+		if (!page->mapping)
+			BUG();
+		if (PageSwapCache(page))
+			BUG();
+		if (!page->pte.mapcount)
+			BUG();
+		page->pte.mapcount--;
+		if (!page->pte.mapcount)
+			dec_page_state(nr_mapped);
+		goto out_unlock;
+	}
+
 	if (PageDirect(page)) {
 		if (page->pte.direct == pte_paddr) {
 			page->pte.direct = 0;
@@ -279,6 +450,106 @@ out_unlock:
 }
 
 /**
+ * try_to_unmap_obj_one - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Determine whether a page is mapped in a given vma and unmap it if it's found.
+ *
+ * This function is strictly a helper function for try_to_unmap_obj.
+ */
+static inline int
+try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte;
+	pte_t pteval;
+	int ret = SWAP_AGAIN;
+
+	if (!spin_trylock(&mm->page_table_lock))
+		return ret;
+
+	pte = find_pte(vma, page, &address);
+	if (!pte)
+		goto out;
+
+	if (vma->vm_flags & VM_LOCKED) {
+		ret = SWAP_FAIL;
+		goto out_unmap;
+	}
+
+	flush_cache_page(vma, address);
+	pteval = ptep_get_and_clear(pte);
+	flush_tlb_page(vma, address);
+
+	if (pte_dirty(pteval))
+		set_page_dirty(page);
+
+	if (!page->pte.mapcount)
+		BUG();
+
+	mm->rss--;
+	page->pte.mapcount--;
+	page_cache_release(page);
+
+out_unmap:
+	pte_unmap(pte);
+
+out:
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
+/**
+ * try_to_unmap_obj - unmap a page using the object-based rmap method
+ * @page: the page to unmap
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * This function is only called from try_to_unmap for object-based pages.
+ *
+ * The semaphore address_space->i_shared_sem is tried.  If it can't be gotten,
+ * return a temporary error.
+ */
+static int
+try_to_unmap_obj(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma;
+	int ret = SWAP_AGAIN;
+
+	if (!mapping)
+		BUG();
+
+	if (PageSwapCache(page))
+		BUG();
+
+	if (down_trylock(&mapping->i_shared_sem))
+		return ret;
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)))
+			break;
+		ret = try_to_unmap_obj_one(vma, page);
+		if (ret == SWAP_FAIL || !page->pte.mapcount)
+			goto out;
+	}
+
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)))
+			break;
+		ret = try_to_unmap_obj_one(vma, page);
+		if (ret == SWAP_FAIL || !page->pte.mapcount)
+			goto out;
+	}
+
+out:
+	up(&mapping->i_shared_sem);
+	return ret;
+}
+
+/**
 * try_to_unmap_one - worker function for try_to_unmap
 * @page: page to unmap
 * @ptep: page table entry to unmap from page
@@ -287,9 +558,8 @@ out_unlock:
 * table entry mapping a page.  Because locking order here is opposite
 * to the locking order used by the page fault path, we use trylocks.
 * Locking:
- *	zone->lru_lock			page_launder()
- *	    page lock			page_launder(), trylock
- *		pte_chain_lock		page_launder()
+ *	page lock			shrink_list(), trylock
+ *	    pte_chain_lock		shrink_list()
 *		mm->page_table_lock	try_to_unmap_one(), trylock
 */
static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
@@ -376,8 +646,8 @@ out_unlock:
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path.  Caller must hold zone->lru_lock
- * and the page lock.  Return values are:
+ * page, used in the pageout path.  Caller must hold the page lock
+ * and its pte chain lock.  Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a trylock, try again later
@@ -398,6 +668,15 @@ int try_to_unmap(struct page * page)
 	if (!page->mapping)
 		BUG();
 
+	/*
+	 * If it's an object-based page, use the object vma chain to find all
+	 * the mappings.
+	 */
+	if (!PageAnon(page)) {
+		ret = try_to_unmap_obj(page);
+		goto out;
+	}
+
 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
 		if (ret == SWAP_SUCCESS) {
@@ -453,12 +732,119 @@ int try_to_unmap(struct page * page)
 		}
 	}
 out:
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
 		dec_page_state(nr_mapped);
+		ret = SWAP_SUCCESS;
+	}
 	return ret;
 }
 
 /**
+ * page_convert_anon - Convert an object-based mapped page to pte_chain-based.
+ * @page: the page to convert
+ *
+ * Find all the mappings for an object-based page and convert them
+ * to 'anonymous', ie create a pte_chain and store all the pte pointers there.
+ *
+ * This function takes the address_space->i_shared_sem, sets the PageAnon flag,
+ * then sets the mm->page_table_lock for each vma and calls page_add_rmap.  This
+ * means there is a period when PageAnon is set, but still has some mappings
+ * with no pte_chain entry.  This is in fact safe, since page_remove_rmap will
+ * simply not find it.  try_to_unmap might erroneously return success, but it
+ * will never be called because the page_convert_anon() caller has locked the
+ * page.
+ *
+ * page_referenced() may fail to scan all the appropriate pte's and may return
+ * an inaccurate result.  This is so rare that it does not matter.
+ */
+int page_convert_anon(struct page *page)
+{
+	struct address_space *mapping;
+	struct vm_area_struct *vma;
+	struct pte_chain *pte_chain = NULL;
+	pte_t *pte;
+	int err = 0;
+
+	mapping = page->mapping;
+	if (mapping == NULL)
+		goto out;		/* truncate won the lock_page() race */
+
+	down(&mapping->i_shared_sem);
+	pte_chain_lock(page);
+
+	/*
+	 * Has someone else done it for us before we got the lock?
+	 * If so, pte.direct or pte.chain has replaced pte.mapcount.
+	 */
+	if (PageAnon(page)) {
+		pte_chain_unlock(page);
+		goto out_unlock;
+	}
+
+	SetPageAnon(page);
+	if (page->pte.mapcount == 0) {
+		pte_chain_unlock(page);
+		goto out_unlock;
+	}
+	/* This is gonna get incremented by page_add_rmap */
+	dec_page_state(nr_mapped);
+	page->pte.mapcount = 0;
+
+	/*
+	 * Now that the page is marked as anon, unlock it.  page_add_rmap will
+	 * lock it as necessary.
+	 */
+	pte_chain_unlock(page);
+
+	list_for_each_entry(vma, &mapping->i_mmap, shared) {
+		if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)))
+			break;
+		if (!pte_chain) {
+			pte_chain = pte_chain_alloc(GFP_KERNEL);
+			if (!pte_chain) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&vma->vm_mm->page_table_lock);
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			/* Make sure this isn't a duplicate */
+			page_remove_rmap(page, pte);
+			pte_chain = page_add_rmap(page, pte, pte_chain);
+			pte_unmap(pte);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	}
+	list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+		if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)))
+			break;
+		if (!pte_chain) {
+			pte_chain = pte_chain_alloc(GFP_KERNEL);
+			if (!pte_chain) {
+				err = -ENOMEM;
+				goto out_unlock;
+			}
+		}
+		spin_lock(&vma->vm_mm->page_table_lock);
+		pte = find_pte(vma, page, NULL);
+		if (pte) {
+			/* Make sure this isn't a duplicate */
+			page_remove_rmap(page, pte);
+			pte_chain = page_add_rmap(page, pte, pte_chain);
+			pte_unmap(pte);
+		}
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	}
+
+out_unlock:
+	pte_chain_free(pte_chain);
+	up(&mapping->i_shared_sem);
+out:
+	return err;
+}
+
+/**
 ** No more VM stuff below this comment, only pte_chain helper
 ** functions.
 **/
diff -puN mm/swapfile.c~objrmap mm/swapfile.c
--- 25/mm/swapfile.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/swapfile.c	2003-04-08 03:16:49.000000000 -0700
@@ -377,41 +377,34 @@ void free_swap_and_cache(swp_entry_t ent
 * share this swap entry, so be cautious and let do_wp_page work out
 * what to do if a write is requested later.
 */
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
+/* vma->vm_mm->page_table_lock is held */
static void unuse_pte(struct vm_area_struct *vma, unsigned long address,
	pte_t *dir, swp_entry_t entry, struct page *page,
	struct pte_chain **pte_chainp)
{
-	pte_t pte = *dir;
-
-	if (pte_file(pte))
-		return;
-	if (likely(pte_to_swp_entry(pte).val != entry.val))
-		return;
-	if (unlikely(pte_none(pte) || pte_present(pte)))
-		return;
+	vma->vm_mm->rss++;
	get_page(page);
	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	SetPageAnon(page);
	*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
	swap_free(entry);
-	++vma->vm_mm->rss;
}

-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
-	swp_entry_t entry, struct page* page)
+	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	pte_t * pte;
	unsigned long end;
-	struct pte_chain *pte_chain = NULL;
+	pte_t swp_pte = swp_entry_to_pte(entry);

	if (pmd_none(*dir))
-		return;
+		return 0;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
-		return;
+		return 0;
	}
	pte = pte_offset_map(dir, address);
	offset += address & PMD_MASK;
@@ -421,33 +414,36 @@ static void unuse_pmd(struct vm_area_str
		end = PMD_SIZE;
	do {
		/*
-		 * FIXME: handle pte_chain_alloc() failures
+		 * swapoff spends a _lot_ of time in this loop!
+		 * Test inline before going to call unuse_pte.
		 */
-		if (pte_chain == NULL)
-			pte_chain = pte_chain_alloc(GFP_ATOMIC);
-		unuse_pte(vma, offset+address-vma->vm_start,
-			  pte, entry, page, &pte_chain);
+		if (unlikely(pte_same(*pte, swp_pte))) {
+			unuse_pte(vma, offset + address, pte,
+				  entry, page, pte_chainp);
+			pte_unmap(pte);
+			return 1;
+		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	pte_unmap(pte - 1);
-	pte_chain_free(pte_chain);
+	return 0;
}

-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
-	swp_entry_t entry, struct page* page)
+	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	pmd_t * pmd;
	unsigned long offset, end;

	if (pgd_none(*dir))
-		return;
+		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
-		return;
+		return 0;
	}
	pmd = pmd_offset(dir, address);
	offset = address & PGDIR_MASK;
@@ -458,32 +454,42 @@ static void unuse_pgd(struct vm_area_str
	if (address >= end)
		BUG();
	do {
-		unuse_pmd(vma, pmd, address, end - address, offset, entry,
-			  page);
+		if (unuse_pmd(vma, pmd, address, end - address,
+			      offset, entry, page, pte_chainp))
+			return 1;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
+	return 0;
}

-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-	swp_entry_t entry, struct page* page)
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
{
	unsigned long start = vma->vm_start, end = vma->vm_end;

	if (start >= end)
		BUG();
	do {
-		unuse_pgd(vma, pgdir, start, end - start, entry, page);
+		if (unuse_pgd(vma, pgdir, start, end - start,
+			      entry, page, pte_chainp))
+			return 1;
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (start && (start < end));
+	return 0;
}

-static void unuse_process(struct mm_struct * mm,
+static int unuse_process(struct mm_struct * mm,
			swp_entry_t entry, struct page* page)
{
	struct vm_area_struct* vma;
+	struct pte_chain *pte_chain;
+
+	pte_chain = pte_chain_alloc(GFP_KERNEL);
+	if (!pte_chain)
+		return -ENOMEM;

	/*
	 * Go through process' page directory.
@@ -491,10 +497,12 @@ static void unuse_process(struct mm_stru
	spin_lock(&mm->page_table_lock);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		unuse_vma(vma, pgd, entry, page);
+		if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+			break;
	}
	spin_unlock(&mm->page_table_lock);
-	return;
+	pte_chain_free(pte_chain);
+	return 0;
}

/*
@@ -638,36 +646,54 @@ static int try_to_unuse(unsigned int typ
		if (start_mm == &init_mm)
			shmem = shmem_unuse(entry, page);
		else
-			unuse_process(start_mm, entry, page);
+			retval = unuse_process(start_mm, entry, page);
	}
	if (*swap_map > 1) {
		int set_start_mm = (*swap_map >= swcount);
		struct list_head *p = &start_mm->mmlist;
		struct mm_struct *new_start_mm = start_mm;
+		struct mm_struct *prev_mm = start_mm;
		struct mm_struct *mm;

+		atomic_inc(&new_start_mm->mm_users);
+		atomic_inc(&prev_mm->mm_users);
		spin_lock(&mmlist_lock);
-		while (*swap_map > 1 &&
+		while (*swap_map > 1 && !retval &&
				(p = p->next) != &start_mm->mmlist) {
			mm = list_entry(p, struct mm_struct, mmlist);
+			atomic_inc(&mm->mm_users);
+			spin_unlock(&mmlist_lock);
+			mmput(prev_mm);
+			prev_mm = mm;
+
+			cond_resched();
+
			swcount = *swap_map;
-			if (mm == &init_mm) {
+			if (swcount <= 1)
+				;
+			else if (mm == &init_mm) {
				set_start_mm = 1;
-				spin_unlock(&mmlist_lock);
				shmem = shmem_unuse(entry, page);
-				spin_lock(&mmlist_lock);
			} else
-				unuse_process(mm, entry, page);
+				retval = unuse_process(mm, entry, page);
			if (set_start_mm && *swap_map < swcount) {
+				mmput(new_start_mm);
+				atomic_inc(&mm->mm_users);
				new_start_mm = mm;
				set_start_mm = 0;
			}
+			spin_lock(&mmlist_lock);
		}
-		atomic_inc(&new_start_mm->mm_users);
		spin_unlock(&mmlist_lock);
+		mmput(prev_mm);
		mmput(start_mm);
		start_mm = new_start_mm;
	}
+	if (retval) {
+		unlock_page(page);
+		page_cache_release(page);
+		break;
+	}

	/*
	 * How could swap count reach 0x7fff when the maximum
@@ -691,7 +717,7 @@ static int try_to_unuse(unsigned int typ
	/*
	 * If a reference remains (rare), we would like to leave
-	 * the page in the swap cache; but try_to_swap_out could
+	 * the page in the swap cache; but try_to_unmap could
	 * then re-duplicate the entry once we drop page lock,
	 * so we might loop indefinitely; also, that page could
	 * not be swapped out to other storage meanwhile.  So:
@@ -727,7 +753,7 @@ static int try_to_unuse(unsigned int typ
	/*
	 * So we could skip searching mms once swap count went
	 * to 1, we did not mark any present ptes as dirty: must
-	 * mark page dirty so try_to_swap_out will preserve it.
+	 * mark page dirty so shrink_list will preserve it.
	 */
	SetPageDirty(page);
	unlock_page(page);
diff -puN include/asm-ppc64/mman.h~objrmap include/asm-ppc64/mman.h
diff -puN include/linux/rmap-locking.h~objrmap include/linux/rmap-locking.h
diff -puN include/linux/swap.h~objrmap include/linux/swap.h
--- 25/include/linux/swap.h~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/include/linux/swap.h	2003-04-08 03:16:49.000000000 -0700
@@ -178,6 +178,8 @@ struct pte_chain *FASTCALL(page_add_rmap
 void FASTCALL(page_remove_rmap(struct page *, pte_t *));
 int FASTCALL(try_to_unmap(struct page *));
 
+int page_convert_anon(struct page *);
+
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
 #else
diff -puN mm/filemap.c~objrmap mm/filemap.c
--- 25/mm/filemap.c~objrmap	2003-04-08 03:16:49.000000000 -0700
+++ 25-akpm/mm/filemap.c 2003-04-08 03:16:49.000000000 -0700
@@ -64,6 +64,9 @@
 *  ->mmap_sem
 *    ->i_shared_sem		(various places)
 *
+ *  ->lock_page
+ *    ->i_shared_sem		(page_convert_anon)
+ *
 *  ->inode_lock
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->page_lock	(__sync_single_inode)

_
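As a reader aid (not part of the patch), here is a small standalone sketch of
the address arithmetic find_pte() depends on: for a linearly mapped file page,
the mapping address follows from page->index and vma->vm_pgoff alone, which is
also why install_page() converts nonlinear pages to PG_anon before that
assumption breaks.  The toy_vma structure, the page_address_in_vma() helper
and the 4K page-size constants are assumptions for illustration only.

/*
 * Illustrative userspace sketch of the linear-mapping arithmetic used by
 * find_pte().  Simplified stand-ins, not kernel code.
 */
#include <stdio.h>

#define PAGE_SHIFT		12	/* assume 4K pages */
#define PAGE_CACHE_SHIFT	12	/* page cache units == pages here */
#define PAGE_SIZE		(1UL << PAGE_SHIFT)

struct toy_vma {
	unsigned long vm_start;	/* first virtual address of the mapping */
	unsigned long vm_end;	/* one past the last virtual address */
	unsigned long vm_pgoff;	/* file offset of vm_start, in pages */
};

/*
 * Return the virtual address where file page 'index' would be mapped in
 * 'vma', or 0 if that page lies outside this vma.  Mirrors the first few
 * lines of find_pte().
 */
static unsigned long page_address_in_vma(const struct toy_vma *vma,
					 unsigned long index)
{
	unsigned long loffset = index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	unsigned long address = vma->vm_start +
				((loffset - vma->vm_pgoff) << PAGE_SHIFT);

	if (address < vma->vm_start || address >= vma->vm_end)
		return 0;
	return address;
}

int main(void)
{
	/* a 16-page mapping of file pages 8..23 starting at 0x40000000 */
	struct toy_vma vma = {
		.vm_start = 0x40000000,
		.vm_end   = 0x40000000 + 16 * PAGE_SIZE,
		.vm_pgoff = 8,
	};

	printf("page 10 -> %#lx\n", page_address_in_vma(&vma, 10)); /* inside */
	printf("page  2 -> %#lx\n", page_address_in_vma(&vma, 2));  /* before the vma */
	printf("page 40 -> %#lx\n", page_address_in_vma(&vma, 40)); /* past the vma */
	return 0;
}

Compiled and run, the first lookup lands inside the vma while the other two
fall outside and yield 0, mirroring the range check at the top of find_pte().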