From Dave. Crappy name. 25-akpm/fs/exec.c | 1 25-akpm/include/linux/mm.h | 1 25-akpm/include/linux/page-flags.h | 5 25-akpm/include/linux/swap.h | 2 25-akpm/mm/filemap.c | 3 25-akpm/mm/fremap.c | 22 +- 25-akpm/mm/memory.c | 8 25-akpm/mm/mmap.c | 46 +++- 25-akpm/mm/page_alloc.c | 2 25-akpm/mm/rmap.c | 391 ++++++++++++++++++++++++++++++++++++- 25-akpm/mm/swapfile.c | 1 11 files changed, 466 insertions(+), 16 deletions(-) diff -puN fs/exec.c~objrmap fs/exec.c --- 25/fs/exec.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/fs/exec.c Tue May 13 15:33:59 2003 @@ -316,6 +316,7 @@ void put_dirty_page(struct task_struct * } lru_cache_add_active(page); flush_dcache_page(page); + SetPageAnon(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); pte_chain = page_add_rmap(page, pte, pte_chain); pte_unmap(pte); diff -puN include/linux/mm.h~objrmap include/linux/mm.h --- 25/include/linux/mm.h~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/include/linux/mm.h Tue May 13 15:33:38 2003 @@ -175,6 +175,7 @@ struct page { struct pte_chain *chain;/* Reverse pte mapping pointer. * protected by PG_chainlock */ pte_addr_t direct; + int mapcount; } pte; unsigned long private; /* mapping-private opaque data */ diff -puN include/linux/page-flags.h~objrmap include/linux/page-flags.h --- 25/include/linux/page-flags.h~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/include/linux/page-flags.h Tue May 13 15:33:38 2003 @@ -74,6 +74,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_anon 20 /* Anonymous page */ /* @@ -257,6 +258,10 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. diff -puN include/linux/swap.h~objrmap include/linux/swap.h --- 25/include/linux/swap.h~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/include/linux/swap.h Tue May 13 15:33:38 2003 @@ -186,6 +186,8 @@ struct pte_chain *FASTCALL(page_add_rmap void FASTCALL(page_remove_rmap(struct page *, pte_t *)); int FASTCALL(try_to_unmap(struct page *)); +int page_convert_anon(struct page *); + /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); #else diff -puN mm/filemap.c~objrmap mm/filemap.c --- 25/mm/filemap.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/filemap.c Tue May 13 15:33:38 2003 @@ -63,6 +63,9 @@ * ->mmap_sem * ->i_shared_sem (various places) * + * ->lock_page + * ->i_shared_sem (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) diff -puN mm/fremap.c~objrmap mm/fremap.c --- 25/mm/fremap.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/fremap.c Tue May 13 15:33:38 2003 @@ -60,10 +60,26 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; struct pte_chain *pte_chain; + unsigned long pgidx; pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto err; + + /* + * Convert this page to anon for objrmap if it's nonlinear + */ + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (!PageAnon(page) && (page->index != pgidx)) { + lock_page(page); + err = page_convert_anon(page); + unlock_page(page); + if (err < 0) + goto err_free; + } + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -85,12 +101,10 @@ int install_page(struct mm_struct *mm, s if (flush) flush_tlb_page(vma, addr); - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return 0; - + err = 0; err_unlock: spin_unlock(&mm->page_table_lock); +err_free: pte_chain_free(pte_chain); err: return err; diff -puN mm/memory.c~objrmap mm/memory.c --- 25/mm/memory.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/memory.c Tue May 13 15:33:38 2003 @@ -1050,6 +1050,7 @@ static int do_wp_page(struct mm_struct * ++mm->rss; page_remove_rmap(old_page, page_table); break_cow(vma, new_page, address, page_table); + SetPageAnon(new_page); pte_chain = page_add_rmap(new_page, page_table, pte_chain); lru_cache_add_active(new_page); @@ -1253,6 +1254,7 @@ static int do_swap_page(struct mm_struct flush_icache_page(vma, page); set_pte(page_table, pte); + SetPageAnon(page); pte_chain = page_add_rmap(page, page_table, pte_chain); /* No need to invalidate - it was non-present before */ @@ -1318,6 +1320,7 @@ do_anonymous_page(struct mm_struct *mm, entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add_active(page); mark_page_accessed(page); + SetPageAnon(page); } set_pte(page_table, entry); @@ -1377,6 +1380,10 @@ do_no_page(struct mm_struct *mm, struct if (!pte_chain) goto oom; + /* See if nopage returned an anon page */ + if (!new_page->mapping || PageSwapCache(new_page)) + SetPageAnon(new_page); + /* * Should we do an early C-O-W break? */ @@ -1389,6 +1396,7 @@ do_no_page(struct mm_struct *mm, struct copy_user_highpage(page, new_page, address); page_cache_release(new_page); lru_cache_add_active(page); + SetPageAnon(page); new_page = page; } diff -puN mm/page_alloc.c~objrmap mm/page_alloc.c --- 25/mm/page_alloc.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/page_alloc.c Tue May 13 15:33:38 2003 @@ -263,6 +263,8 @@ static inline void free_pages_check(cons bad_page(function, page); if (PageDirty(page)) ClearPageDirty(page); + if (PageAnon(page)) + ClearPageAnon(page); } /* diff -puN mm/rmap.c~objrmap mm/rmap.c --- 25/mm/rmap.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/rmap.c Tue May 13 15:33:38 2003 @@ -102,6 +102,140 @@ pte_chain_encode(struct pte_chain *pte_c **/ /** + * find_pte - Find a pte pointer given a vma and a struct page. + * @vma: the vma to search + * @page: the page to find + * + * Determine if this page is mapped in this vma. If it is, map and rethrn + * the pte pointer associated with it. Return null if the page is not + * mapped in this vma for any reason. + * + * This is strictly an internal helper function for the object-based rmap + * functions. + * + * It is the caller's responsibility to unmap the pte if it is returned. + */ +static inline pte_t * +find_pte(struct vm_area_struct *vma, struct page *page, unsigned long *addr) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned long loffset; + unsigned long address; + + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out; + + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap; + + if (addr) + *addr = address; + + return pte; + +out_unmap: + pte_unmap(pte); +out: + return NULL; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @vma: the vma to look in. + * @page: the page we're working on. + * + * Find a pte entry for a page/vma pair, then check and clear the referenced + * bit. + * + * This is strictly a helper function for page_referenced_obj. + */ +static int +page_referenced_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + int referenced = 0; + + if (!spin_trylock(&mm->page_table_lock)) + return 1; + + pte = find_pte(vma, page, NULL); + if (pte) { + if (ptep_test_and_clear_young(pte)) + referenced++; + pte_unmap(pte); + } + + spin_unlock(&mm->page_table_lock); + return referenced; +} + +/** + * page_referenced_obj_one - referenced check for object-based rmap + * @page: the page we're checking references on. + * + * For an object-based mapped page, find all the places it is mapped and + * check/clear the referenced flag. This is done by following the page->mapping + * pointer, then walking the chain of vmas it holds. It returns the number + * of references it found. + * + * This function is only called from page_referenced for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * assume a reference count of 1. + */ +static int +page_referenced_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int referenced = 0; + + if (!page->pte.mapcount) + return 0; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return 1; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT))) + break; + referenced += page_referenced_obj_one(vma, page); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT))) + break; + referenced += page_referenced_obj_one(vma, page); + } + up(&mapping->i_shared_sem); + + return referenced; +} + +/** * page_referenced - test if the page was referenced * @page: the page to test * @@ -120,6 +254,10 @@ int page_referenced(struct page * page) if (TestClearPageReferenced(page)) referenced++; + if (!PageAnon(page)) { + referenced += page_referenced_obj(page); + goto out; + } if (PageDirect(page)) { pte_t *pte = rmap_ptep_map(page->pte.direct); if (ptep_test_and_clear_young(pte)) @@ -153,6 +291,7 @@ int page_referenced(struct page * page) __pte_chain_free(pc); } } +out: return referenced; } @@ -175,6 +314,21 @@ page_add_rmap(struct page *page, pte_t * pte_chain_lock(page); + /* + * If this is an object-based page, just count it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + inc_page_state(nr_mapped); + page->pte.mapcount++; + goto out; + } + if (page->pte.direct == 0) { page->pte.direct = pte_paddr; SetPageDirect(page); @@ -231,8 +385,25 @@ void page_remove_rmap(struct page *page, pte_chain_lock(page); if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? */ + goto out_unlock; + /* + * If this is an object-based page, just uncount it. We can + * find the mappings by walking the object vma chain for that object. + */ + if (!PageAnon(page)) { + if (!page->mapping) + BUG(); + if (PageSwapCache(page)) + BUG(); + if (!page->pte.mapcount) + BUG(); + page->pte.mapcount--; + if (!page->pte.mapcount) + dec_page_state(nr_mapped); + goto out_unlock; + } + if (PageDirect(page)) { if (page->pte.direct == pte_paddr) { page->pte.direct = 0; @@ -279,6 +450,106 @@ out_unlock: } /** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Determine whether a page is mapped in a given vma and unmap it if it's found. + * + * This function is strictly a helper function for try_to_unmap_obj. + */ +static inline int +try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte; + pte_t pteval; + int ret = SWAP_AGAIN; + + if (!spin_trylock(&mm->page_table_lock)) + return ret; + + pte = find_pte(vma, page, &address); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_FAIL; + goto out_unmap; + } + + flush_cache_page(vma, address); + pteval = ptep_get_and_clear(pte); + flush_tlb_page(vma, address); + + if (pte_dirty(pteval)) + set_page_dirty(page); + + if (!page->pte.mapcount) + BUG(); + + mm->rss--; + page->pte.mapcount--; + page_cache_release(page); + +out_unmap: + pte_unmap(pte); + +out: + spin_unlock(&mm->page_table_lock); + return ret; +} + +/** + * try_to_unmap_obj - unmap a page using the object-based rmap method + * @page: the page to unmap + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + * + * This function is only called from try_to_unmap for object-based pages. + * + * The semaphore address_space->i_shared_sem is tried. If it can't be gotten, + * return a temporary error. + */ +static int +try_to_unmap_obj(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + if (!mapping) + BUG(); + + if (PageSwapCache(page)) + BUG(); + + if (down_trylock(&mapping->i_shared_sem)) + return ret; + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT))) + break; + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT))) + break; + ret = try_to_unmap_obj_one(vma, page); + if (ret == SWAP_FAIL || !page->pte.mapcount) + goto out; + } + +out: + up(&mapping->i_shared_sem); + return ret; +} + +/** * try_to_unmap_one - worker function for try_to_unmap * @page: page to unmap * @ptep: page table entry to unmap from page @@ -403,6 +674,15 @@ int try_to_unmap(struct page * page) if (!page->mapping) BUG(); + /* + * If it's an object-based page, use the object vma chain to find all + * the mappings. + */ + if (!PageAnon(page)) { + ret = try_to_unmap_obj(page); + goto out; + } + if (PageDirect(page)) { ret = try_to_unmap_one(page, page->pte.direct); if (ret == SWAP_SUCCESS) { @@ -458,12 +738,119 @@ int try_to_unmap(struct page * page) } } out: - if (!page_mapped(page)) + if (!page_mapped(page)) { dec_page_state(nr_mapped); + ret = SWAP_SUCCESS; + } return ret; } /** + * page_convert_anon - Convert an object-based mapped page to pte_chain-based. + * @page: the page to convert + * + * Find all the mappings for an object-based page and convert them + * to 'anonymous', ie create a pte_chain and store all the pte pointers there. + * + * This function takes the address_space->i_shared_sem, sets the PageAnon flag, + * then sets the mm->page_table_lock for each vma and calls page_add_rmap. This + * means there is a period when PageAnon is set, but still has some mappings + * with no pte_chain entry. This is in fact safe, since page_remove_rmap will + * simply not find it. try_to_unmap might erroneously return success, but it + * will never be called because the page_convert_anon() caller has locked the + * page. + * + * page_referenced() may fail to scan all the appropriate pte's and may return + * an inaccurate result. This is so rare that it does not matter. + */ +int page_convert_anon(struct page *page) +{ + struct address_space *mapping; + struct vm_area_struct *vma; + struct pte_chain *pte_chain = NULL; + pte_t *pte; + int err = 0; + + mapping = page->mapping; + if (mapping == NULL) + goto out; /* truncate won the lock_page() race */ + + down(&mapping->i_shared_sem); + pte_chain_lock(page); + + /* + * Has someone else done it for us before we got the lock? + * If so, pte.direct or pte.chain has replaced pte.mapcount. + */ + if (PageAnon(page)) { + pte_chain_unlock(page); + goto out_unlock; + } + + SetPageAnon(page); + if (page->pte.mapcount == 0) { + pte_chain_unlock(page); + goto out_unlock; + } + /* This is gonna get incremented by page_add_rmap */ + dec_page_state(nr_mapped); + page->pte.mapcount = 0; + + /* + * Now that the page is marked as anon, unlock it. page_add_rmap will + * lock it as necessary. + */ + pte_chain_unlock(page); + + list_for_each_entry(vma, &mapping->i_mmap, shared) { + if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT))) + break; + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_pgoff > (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT))) + break; + if (!pte_chain) { + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + err = -ENOMEM; + goto out_unlock; + } + } + spin_lock(&vma->vm_mm->page_table_lock); + pte = find_pte(vma, page, NULL); + if (pte) { + /* Make sure this isn't a duplicate */ + page_remove_rmap(page, pte); + pte_chain = page_add_rmap(page, pte, pte_chain); + pte_unmap(pte); + } + spin_unlock(&vma->vm_mm->page_table_lock); + } + +out_unlock: + pte_chain_free(pte_chain); + up(&mapping->i_shared_sem); +out: + return err; +} + +/** ** No more VM stuff below this comment, only pte_chain helper ** functions. **/ diff -puN mm/swapfile.c~objrmap mm/swapfile.c --- 25/mm/swapfile.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/swapfile.c Tue May 13 15:33:38 2003 @@ -385,6 +385,7 @@ unuse_pte(struct vm_area_struct *vma, un vma->vm_mm->rss++; get_page(page); set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); + SetPageAnon(page); *pte_chainp = page_add_rmap(page, dir, *pte_chainp); swap_free(entry); } diff -puN mm/mmap.c~objrmap mm/mmap.c --- 25/mm/mmap.c~objrmap Tue May 13 15:33:38 2003 +++ 25-akpm/mm/mmap.c Tue May 13 15:33:38 2003 @@ -311,14 +311,23 @@ static inline void __vma_link_file(struc if (file) { struct inode * inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; + struct list_head *vmlist, *vmhead; if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + vmhead = &mapping->i_mmap_shared; else - list_add_tail(&vma->shared, &mapping->i_mmap); + vmhead = &mapping->i_mmap; + + list_for_each(vmlist, vmhead) { + struct vm_area_struct *vmtemp; + vmtemp = list_entry(vmlist, struct vm_area_struct, shared); + if (vmtemp->vm_pgoff >= vma->vm_pgoff) + break; + } + list_add_tail(&vma->shared, vmlist); } } @@ -374,6 +383,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + down(&inode->i_mapping->i_shared_sem); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + up(&inode->i_mapping->i_shared_sem); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -426,8 +457,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -447,6 +476,7 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; int need_up = 0; @@ -494,10 +524,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -1219,8 +1246,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; _