From: Hugh Dickins The previous patch let the ptes of file pages be located via page ->mapping->i_mmap and i_mmap_shared lists of vmas; which works well unless the vma is VM_NONLINEAR - one in which sys_remap_file_pages has been used to place pages in unexpected places, to avoid an explosion of distinct unmergable vmas. Such pages were effectively locked in memory. page_referenced_file is already skipping nonlinear vmas, they'd just waste its time, and age unfairly any pages in their proper positions. Now extend try_to_unmap_file, to persuade it to swap from nonlinears. Ignoring the page requested, try to unmap cluster of 32 neighbouring ptes (in worst case all empty slots) in a nonlinear vma, then move on to the next vma; stopping when we've unmapped at least as many maps as the requested page had (vague guide of how hard to try), or have reached the end. With large sparse nonlinear vmas, this could take a long time: inserted a cond_resched while no locks are held, unusual at this level but I think okay, shrink_list does so. Use vm_private_data a little like the old mm->swap_address, as a cursor recording how far we got, so we don't attack the same ptes next time around (earlier tried inserting an empty marker vma in the list, but that got messy). How well this will work on real- life nonlinear vmas remains to be seen, but should work better than locking them all in memory, or swapping everything out all the time. Existing users of vm_private_data have either VM_RESERVED or VM_DONTEXPAND set, both of which are in the VM_SPECIAL category where we never try to merge vmas: so removed the vm_private_data test from is_mergeable_vma, so we can still merge VM_NONLINEARs. Of course, we could instead add another diff -upN reference/mm/fremap.c current/mm/fremap.c --- reference/mm/fremap.c 2004-05-01 10:20:57.000000000 -0700 +++ current/mm/fremap.c 2004-05-01 10:20:57.000000000 -0700 @@ -186,15 +186,18 @@ asmlinkage long sys_remap_file_pages(uns /* * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within - * the single existing vma: + * the single existing vma. vm_private_data is used as a + * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED + * or VM_LOCKED, but VM_LOCKED could be revoked later on). */ if (vma && (vma->vm_flags & VM_SHARED) && + (!vma->vm_private_data || (vma->vm_flags & VM_RESERVED)) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) + if (pgoff != linear_page_index(vma, start)) vma->vm_flags |= VM_NONLINEAR; /* ->populate can take a long time, so downgrade the lock. */ diff -upN reference/mm/mmap.c current/mm/mmap.c --- reference/mm/mmap.c 2004-04-30 11:23:57.000000000 -0700 +++ current/mm/mmap.c 2004-05-01 10:20:57.000000000 -0700 @@ -333,8 +333,6 @@ static inline int is_mergeable_vma(struc return 0; if (vma->vm_flags != vm_flags) return 0; - if (vma->vm_private_data) - return 0; return 1; } diff -upN reference/mm/rmap.c current/mm/rmap.c --- reference/mm/rmap.c 2004-05-01 10:20:57.000000000 -0700 +++ current/mm/rmap.c 2004-05-01 10:20:57.000000000 -0700 @@ -593,6 +593,94 @@ out: return ret; } +/* + * try_to_unmap_cluster is only used on VM_NONLINEAR shared object vmas, + * in which objrmap is unable to predict where a page will be found. + */ +#define CLUSTER_SIZE (32 * PAGE_SIZE) +#if CLUSTER_SIZE > PMD_SIZE +#undef CLUSTER_SIZE +#define CLUSTER_SIZE PMD_SIZE +#endif +#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) + +static int try_to_unmap_cluster(struct mm_struct *mm, unsigned long cursor, + unsigned int *mapcount, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + struct page *page; + unsigned long address; + unsigned long end; + unsigned long pfn; + + /* + * We need the page_table_lock to protect us from page faults, + * munmap, fork, etc... + */ + if (!spin_trylock(&mm->page_table_lock)) + return SWAP_FAIL; + + address = (vma->vm_start + cursor) & CLUSTER_MASK; + end = address + CLUSTER_SIZE; + if (address < vma->vm_start) + address = vma->vm_start; + if (end > vma->vm_end) + end = vma->vm_end; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) + goto out_unlock; + + for (pte = pte_offset_map(pmd, address); + address < end; pte++, address += PAGE_SIZE) { + + if (!pte_present(*pte)) + continue; + + pfn = pte_pfn(*pte); + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + BUG_ON(PageAnon(page)); + if (PageReserved(page)) + continue; + + if (ptep_test_and_clear_young(pte)) + continue; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address); + pteval = ptep_clear_flush(vma, address, pte); + + /* If nonlinear, store the file page offset in the pte. */ + if (page->index != linear_page_index(vma, address)) + set_pte(pte, pgoff_to_pte(page->index)); + + /* Move the dirty bit to the physical page now the pte is gone. */ + if (pte_dirty(pteval)) + set_page_dirty(page); + + page_remove_rmap(page, pte); + page_cache_release(page); + mm->rss--; + (*mapcount)--; + } + + pte_unmap(pte); + +out_unlock: + spin_unlock(&mm->page_table_lock); + return SWAP_AGAIN; +} + /** * try_to_unmap_file - unmap file page using the object-based rmap method * @page: the page to unmap @@ -613,6 +701,9 @@ static inline int try_to_unmap_file(stru struct vm_area_struct *vma; unsigned long address; int ret = SWAP_AGAIN; + unsigned long cursor; + unsigned long max_nl_cursor = 0; + unsigned long max_nl_size = 0; if (down_trylock(&mapping->i_shared_sem)) return ret; @@ -630,8 +721,22 @@ static inline int try_to_unmap_file(stru } list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { - if (unlikely(vma->vm_flags & VM_NONLINEAR)) + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + /* + * Defer unmapping nonlinear to the next loop, + * but take notes while we're here e.g. don't + * want to loop again when no nonlinear vmas. + */ + if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + continue; + cursor = (unsigned long) vma->vm_private_data; + if (cursor > max_nl_cursor) + max_nl_cursor = cursor; + cursor = vma->vm_end - vma->vm_start; + if (cursor > max_nl_size) + max_nl_size = cursor; continue; + } if (vma->vm_mm->rss) { address = vma_address(vma, pgoff); if (address == -EFAULT) @@ -642,6 +747,62 @@ static inline int try_to_unmap_file(stru goto out; } } + + if (max_nl_size == 0) /* no nonlinear vmas of this file */ + goto out; + + /* + * We don't try to search for this page in the nonlinear vmas, + * and page_referenced wouldn't have found it anyway. Instead + * just walk the nonlinear vmas trying to age and unmap some. + * The mapcount of the page we came in with is irrelevant, + * but even so use it as a guide to how hard we should try? + */ + rmap_unlock(page); + + max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; + if (max_nl_cursor == 0) + max_nl_cursor = CLUSTER_SIZE; + + do { + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if (VM_NONLINEAR != (vma->vm_flags & + (VM_NONLINEAR|VM_LOCKED|VM_RESERVED))) + continue; + cursor = (unsigned long) vma->vm_private_data; + while (vma->vm_mm->rss && + cursor < max_nl_cursor && + cursor < vma->vm_end - vma->vm_start) { + ret = try_to_unmap_cluster(vma->vm_mm, + cursor, &mapcount, vma); + if (ret == SWAP_FAIL) + break; + cursor += CLUSTER_SIZE; + vma->vm_private_data = (void *) cursor; + if ((int)mapcount <= 0) + goto relock; + cond_resched(); + } + if (ret != SWAP_FAIL) + vma->vm_private_data = + (void *) max_nl_cursor; + ret = SWAP_AGAIN; + } + max_nl_cursor += CLUSTER_SIZE; + } while (max_nl_cursor <= max_nl_size); + + /* + * Don't loop forever (perhaps all the remaining pages are + * in locked vmas). Reset cursor on all unreserved nonlinear + * vmas, now forgetting on which ones it had fallen behind. + */ + list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + if ((vma->vm_flags & (VM_NONLINEAR|VM_RESERVED)) == + VM_NONLINEAR) + vma->vm_private_data = 0; + } +relock: + rmap_lock(page); out: up(&mapping->i_shared_sem); return ret;