From: Steve Longerbeam

The patch allows NUMA policies for file mappings.  Page cache pages are
allocated using a policy located in a shared_policy red-black tree attached
to the mapping object (address_space).  This involves:

1. Add a shared_policy tree to the address_space object in fs.h.

2. Modify page_cache_alloc() in pagemap.h to take a page index in addition
   to a mapping object, and use those to locate the correct policy in the
   mapping->policy tree when allocating the page.

3. Modify filemap.c to pass the additional page offset to page_cache_alloc().

4. Also in filemap.c, implement generic file {set|get}_policy() methods and
   add those to generic_file_vm_ops.

5. Init the file's shared policy in alloc_inode(), and free the shared
   policy in destroy_inode().

In addition, the patch adds a new flag to the mbind() syscall, MPOL_MF_MOVE.
If the flag is set, any existing mapped anonymous or filemap pagecache pages
that are, or can be, used for the given virtual memory region and that do
not satisfy the NUMA policy are moved to a new page that satisfies the
policy.

Here's how the new flag works with the existing MPOL_MF_STRICT flag (in the
following discussion, "invalid page" means a page that does not satisfy the
NUMA policy); a userspace usage sketch follows the changelog:

MOVE and STRICT both set: attempt to move invalid pages, and if any move
fails, return mbind() syscall failure.

MOVE set: attempt to move invalid pages, but do not return an error if a
move fails.

STRICT set: do not attempt to move invalid pages; return mbind() failure if
any are found (same behaviour as before).

Neither MOVE nor STRICT set: ignore invalid pages.

In the default !NUMA case, there are no additional CPU cycles involved.
Well, there is the additional page index passed to page_cache_alloc() and
friends, but the extra cycles there are very small.

In the NUMA case, there is of course extra processing if MPOL_MF_MOVE is
passed to the mbind() syscall: it loops through every page index of the new
region, looking for invalid existing pte-mapped and pagecache pages, and
replaces them.  But this is done at syscall time, so it is not time
critical.  It could take a while for huge mappings, but only mm->mmap_sem is
held during the whole search, and mm->page_table_lock is held only long
enough to replace a single invalid pte-mapped page if one is found.

The patch adds a few words to struct inode if CONFIG_NUMA is set.
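For illustration only (not part of the patch): a minimal userspace sketch of
how the new flag might be used on a file mapping.  It assumes the MPOL_* and
MPOL_MF_* values shown in the diff below, and that the C library exposes the
mbind syscall number as SYS_mbind; error handling is reduced to perror().

	/*
	 * Hedged example: bind the page cache pages backing a file mapping
	 * to node 0, asking the kernel to move any already-present pages
	 * that sit on the wrong node (MPOL_MF_MOVE), and to fail if a move
	 * is not possible (MPOL_MF_STRICT).
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#define MPOL_BIND	2	/* from include/linux/mempolicy.h */
	#define MPOL_MF_STRICT	(1<<0)
	#define MPOL_MF_MOVE	(1<<1)	/* new flag added by this patch */

	int main(int argc, char **argv)
	{
		unsigned long nodemask = 1UL << 0;	/* node 0 only */
		size_t len = 16 * 4096;
		void *p;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* sys_mbind(start, len, mode, nodemask, maxnode, flags) */
		if (syscall(SYS_mbind, p, len, MPOL_BIND, &nodemask,
			    sizeof(nodemask) * 8,
			    MPOL_MF_MOVE | MPOL_MF_STRICT) < 0)
			perror("mbind");	/* e.g. EIO if a page could not be moved */

		/* faults in this mapping now allocate pagecache pages on node 0 */
		munmap(p, len);
		close(fd);
		return 0;
	}

With MPOL_MF_MOVE alone (no MPOL_MF_STRICT), the same call would succeed
even if some existing pages could not be moved.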
Acked-by: Andi Kleen
Signed-off-by: Andrew Morton
---

 /dev/null                          |    0
 25-akpm/fs/inode.c                 |    5
 25-akpm/include/linux/fs.h         |    2
 25-akpm/include/linux/mempolicy.h  |   12 +
 25-akpm/include/linux/page-flags.h |    6
 25-akpm/include/linux/pagemap.h    |   18 +
 25-akpm/mm/filemap.c               |   39 +++-
 25-akpm/mm/mempolicy.c             |  354 +++++++++++++++++++++++++++++--------
 25-akpm/mm/readahead.c             |    2
 25-akpm/mm/shmem.c                 |   11 -
 10 files changed, 357 insertions(+), 92 deletions(-)

diff -L fs/cachefs/block.c -puN /dev/null /dev/null
diff -puN fs/inode.c~numa-policies-for-file-mappings-mpol_mf_move fs/inode.c
--- 25/fs/inode.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/fs/inode.c	Wed Nov 17 14:12:37 2004
@@ -150,6 +150,7 @@ static struct inode *alloc_inode(struct
 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
 	mapping->assoc_mapping = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
+	mpol_shared_policy_init(&mapping->policy);
 
 	/*
 	 * If the block_device provides a backing_dev_info for client
@@ -177,8 +178,10 @@ void destroy_inode(struct inode *inode)
 	security_inode_free(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
-	else
+	else {
+		mpol_free_shared_policy(&inode->i_mapping->policy);
 		kmem_cache_free(inode_cachep, (inode));
+	}
 }
diff -puN include/linux/fs.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/fs.h
--- 25/include/linux/fs.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/fs.h	Wed Nov 17 14:12:37 2004
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 struct iovec;
@@ -349,6 +350,7 @@ struct address_space {
 	struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
+	struct shared_policy	policy;		/* page alloc policy */
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
diff -puN include/linux/mempolicy.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/mempolicy.h
--- 25/include/linux/mempolicy.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/mempolicy.h	Wed Nov 17 14:12:37 2004
@@ -22,6 +22,8 @@
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
+#define MPOL_MF_MOVE	(1<<1)	/* Attempt to move pages in mapping that do
+				   not satisfy policy */
 
 #ifdef __KERNEL__
@@ -149,7 +151,8 @@ int mpol_set_shared_policy(struct shared
 void mpol_free_shared_policy(struct shared_policy *p);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
-
+struct page *alloc_page_shared_policy(unsigned gfp, struct shared_policy *sp,
+				      unsigned long idx);
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
@@ -215,6 +218,13 @@ mpol_shared_policy_lookup(struct shared_
 #define vma_policy(vma) NULL
 #define vma_set_policy(vma, pol) do {} while(0)
 
+static inline struct page *
+alloc_page_shared_policy(unsigned gfp, struct shared_policy *sp,
+			 unsigned long idx)
+{
+	return alloc_pages(gfp, 0);
+}
+
 static inline void numa_policy_init(void)
 {
 }
diff -puN include/linux/page-flags.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/page-flags.h
--- 25/include/linux/page-flags.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/page-flags.h	Wed Nov 17 14:12:37 2004
@@ -74,6 +74,8 @@
 #define PG_swapcache		16	/* Swap page: swp_entry_t in private */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
+#define PG_sharedpolicy		19	/* Page was allocated for a file
+					   mapping using a shared_policy */
 
 /*
@@ -290,6 +292,10 @@ extern unsigned long __read_page_state(u
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
+#define PageSharedPolicy(page)		test_bit(PG_sharedpolicy, &(page)->flags)
+#define SetPageSharedPolicy(page)	set_bit(PG_sharedpolicy, &(page)->flags)
+#define ClearPageSharedPolicy(page)	clear_bit(PG_sharedpolicy, &(page)->flags)
+
 #ifdef CONFIG_SWAP
 #define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
 #define SetPageSwapCache(page)	set_bit(PG_swapcache, &(page)->flags)
diff -puN include/linux/pagemap.h~numa-policies-for-file-mappings-mpol_mf_move include/linux/pagemap.h
--- 25/include/linux/pagemap.h~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/include/linux/pagemap.h	Wed Nov 17 14:12:37 2004
@@ -50,14 +50,24 @@ static inline void mapping_set_gfp_mask(
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
-static inline struct page *page_cache_alloc(struct address_space *x)
+
+static inline struct page *__page_cache_alloc(struct address_space *x,
+					      unsigned long idx,
+					      unsigned int gfp_mask)
+{
+	return alloc_page_shared_policy(gfp_mask, &x->policy, idx);
+}
+
+static inline struct page *page_cache_alloc(struct address_space *x,
+					    unsigned long idx)
 {
-	return alloc_pages(mapping_gfp_mask(x), 0);
+	return __page_cache_alloc(x, idx, mapping_gfp_mask(x));
 }
 
-static inline struct page *page_cache_alloc_cold(struct address_space *x)
+static inline struct page *page_cache_alloc_cold(struct address_space *x,
+						 unsigned long idx)
 {
-	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+	return __page_cache_alloc(x, idx, mapping_gfp_mask(x)|__GFP_COLD);
 }
 
 typedef int filler_t(void *, struct page *);
diff -puN mm/filemap.c~numa-policies-for-file-mappings-mpol_mf_move mm/filemap.c
--- 25/mm/filemap.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/filemap.c	Wed Nov 17 14:12:37 2004
@@ -566,7 +566,8 @@ repeat:
 	page = find_lock_page(mapping, index);
 	if (!page) {
 		if (!cached_page) {
-			cached_page = alloc_page(gfp_mask);
+			cached_page = __page_cache_alloc(mapping, index,
+							 gfp_mask);
 			if (!cached_page)
 				return NULL;
 		}
@@ -659,7 +660,7 @@ grab_cache_page_nowait(struct address_sp
 		return NULL;
 	}
 	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
-	page = alloc_pages(gfp_mask, 0);
+	page = __page_cache_alloc(mapping, index, gfp_mask);
 	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
 		page_cache_release(page);
 		page = NULL;
@@ -836,7 +837,7 @@ no_cached_page:
 		 * page..
 		 */
 		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
+			cached_page = page_cache_alloc_cold(mapping, index);
 			if (!cached_page) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
@@ -1099,7 +1100,7 @@ static int fastcall page_cache_read(stru
 	struct page *page;
 	int error;
 
-	page = page_cache_alloc_cold(mapping);
+	page = page_cache_alloc_cold(mapping, offset);
 	if (!page)
 		return -ENOMEM;
 
@@ -1481,9 +1482,35 @@ repeat:
 	return 0;
 }
 
+
+#ifdef CONFIG_NUMA
+int generic_file_set_policy(struct vm_area_struct *vma,
+			    struct mempolicy *new)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	return mpol_set_shared_policy(&mapping->policy, vma, new);
+}
+
+struct mempolicy *
+generic_file_get_policy(struct vm_area_struct *vma,
+			unsigned long addr)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long idx;
+
+	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	return mpol_shared_policy_lookup(&mapping->policy, idx);
+}
+#endif
+
+
 struct vm_operations_struct generic_file_vm_ops = {
 	.nopage		= filemap_nopage,
 	.populate	= filemap_populate,
+#ifdef CONFIG_NUMA
+	.set_policy	= generic_file_set_policy,
+	.get_policy	= generic_file_get_policy,
+#endif
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1533,7 +1560,7 @@ repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
 		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
+			cached_page = page_cache_alloc_cold(mapping, index);
 			if (!cached_page)
 				return ERR_PTR(-ENOMEM);
 		}
@@ -1615,7 +1642,7 @@ repeat:
 	page = find_lock_page(mapping, index);
 	if (!page) {
 		if (!*cached_page) {
-			*cached_page = page_cache_alloc(mapping);
+			*cached_page = page_cache_alloc(mapping, index);
 			if (!*cached_page)
 				return NULL;
 		}
diff -puN mm/mempolicy.c~numa-policies-for-file-mappings-mpol_mf_move mm/mempolicy.c
--- 25/mm/mempolicy.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/mempolicy.c	Wed Nov 17 14:12:54 2004
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * Copyright 2004 Steve Longerbeam, MontaVista Software.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -47,15 +48,28 @@
  */
 
 /* Notebook:
-   fix mmap readahead to honour policy and enable policy for any page cache
-   object
-   statistics for bigpages
-   global policy for page cache? currently it uses process policy. Requires
-   first item above.
+   Page cache pages can now be policied, by adding a shared_policy tree to
+   inodes (actually located in address_space). One entry in the tree for
+   each mapped region of a file. Generic files now have set_policy and
+   get_policy methods in generic_file_vm_ops [stevel].
+
+   Added a page-move feature, whereby existing pte-mapped or filemap
+   pagecache pages that are/can be mapped to the given virtual memory
+   region, that do not satisfy the NUMA policy, are moved to a new
+   page that satisfies the policy. Enabled by the new mbind flag
+   MPOL_MF_MOVE [stevel].
+
+   statistics for bigpages.
+
+   global policy for page cache? currently it uses per-file policies in
+   address_space (see first item above).
+
    handle mremap for shared memory (currently ignored for the policy)
   grows down?
+
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
+
   could replace all the switch()es with a mempolicy_ops structure.
 */
 
@@ -66,6 +80,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -75,6 +90,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 
@@ -232,33 +250,225 @@ static struct mempolicy *mpol_new(int mo
 	return policy;
 }
 
-/* Ensure all existing pages follow the policy. */
+
+/* Return effective policy for a VMA */
+static struct mempolicy *
+get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = current->mempolicy;
+
+	if (vma) {
+		if (vma->vm_ops && vma->vm_ops->get_policy)
+			pol = vma->vm_ops->get_policy(vma, addr);
+		else if (vma->vm_policy &&
+			 vma->vm_policy->policy != MPOL_DEFAULT)
+			pol = vma->vm_policy;
+	}
+	if (!pol)
+		pol = &default_policy;
+	return pol;
+}
+
+
+/* Find secondary valid nodes for an allocation */
+static int __mpol_node_valid(int nid, struct mempolicy *pol)
+{
+	switch (pol->policy) {
+	case MPOL_PREFERRED:
+	case MPOL_DEFAULT:
+	case MPOL_INTERLEAVE:
+		return 1;
+	case MPOL_BIND: {
+		struct zone **z;
+		for (z = pol->v.zonelist->zones; *z; z++)
+			if ((*z)->zone_pgdat->node_id == nid)
+				return 1;
+		return 0;
+	}
+	default:
+		BUG();
+		return 0;
+	}
+}
+
+int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
+{
+	return __mpol_node_valid(nid, get_vma_policy(vma, addr));
+}
+
+/*
+ * The given page doesn't match a file mapped VMA's policy. If the
+ * page is unused, remove it from the page cache, so that a new page
+ * can be later reallocated to the cache using the correct policy.
+ * Returns 0 if the page was removed from the cache, < 0 if failed.
+ *
+ * We use invalidate_mapping_pages(), which doesn't try very hard.
+ * It won't remove pages which are locked (won't wait for a lock),
+ * dirty, under writeback, or mapped by pte's. All the latter are
+ * valid checks for us, but we might be able to improve our success
+ * by waiting for a lock.
+ */
+static int
+remove_invalid_filemap_page(struct page * page,
+			    struct vm_area_struct *vma,
+			    pgoff_t pgoff)
+{
+	/*
+	 * the page in the cache is not in any of the nodes this
+	 * VMA's policy wants it to be in. Can we remove it?
+	 */
+	if (!PageSharedPolicy(page) &&
+	    invalidate_mapping_pages(vma->vm_file->f_mapping,
+				     pgoff, pgoff) > 0) {
+		PDprintk("removed cache page in node %ld, "
+			 "pgoff=%lu, for %s\n",
+			 page_to_nid(page), pgoff,
+			 vma->vm_file->f_dentry->d_name.name);
+		return 0;
+	}
+
+	/*
+	 * the page is being used by other pagetable mappings,
+	 * or is currently locked, dirty, or under writeback.
+	 */
+	PDprintk("could not remove cache page in node %ld, "
+		 "pgoff=%lu, for %s\n",
+		 page_to_nid(page), pgoff,
+		 vma->vm_file->f_dentry->d_name.name);
+	return -EIO;
+}
+
+/*
+ * The given page doesn't match a VMA's policy. Allocate a new
+ * page using the policy, copy contents from old to new, free
+ * the old page, map in the new page. This looks a lot like a COW.
+ */
+static int
+move_invalid_page(struct page * page, struct mempolicy *pol,
+		  struct vm_area_struct *vma, unsigned long addr,
+		  pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page * new_page;
+	struct vm_area_struct pvma;
+	pte_t *page_table;
+	pte_t entry;
+
+	PDprintk("moving anon page in node %ld, address=%08lx\n",
+		 page_to_nid(page), addr);
+
+	if (!PageReserved(page))
+		page_cache_get(page);
+	spin_unlock(&mm->page_table_lock);
+	if (unlikely(anon_vma_prepare(vma)))
+		goto err_no_mem;
+
+	/* Create a pseudo vma that just contains the policy */
+	memset(&pvma, 0, sizeof(struct vm_area_struct));
+	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_pgoff = vma->vm_pgoff;
+	pvma.vm_policy = pol;
+	new_page = alloc_page_vma(GFP_HIGHUSER, &pvma, addr);
+	if (!new_page)
+		goto err_no_mem;
+
+	copy_user_highpage(new_page, page, addr);
+
+	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map(pmd, addr);
+	if (!PageReserved(page))
+		page_remove_rmap(page);
+
+	flush_cache_page(vma, addr);
+	entry = pte_mkdirty(mk_pte(new_page, vma->vm_page_prot));
+	if (likely(vma->vm_flags & VM_WRITE))
+		entry = pte_mkwrite(entry);
+	ptep_establish(vma, addr, page_table, entry);
+	update_mmu_cache(vma, addr, entry);
+	lru_cache_add_active(new_page);
+	page_add_anon_rmap(new_page, vma, addr);
+
+	pte_unmap(page_table);
+	page_cache_release(page);	/* release our ref on the old page */
+	page_cache_release(page);	/* release our pte ref on the old page */
+	return 0;
+
+ err_no_mem:
+	spin_lock(&mm->page_table_lock);
+	return -ENOMEM;
+}
+
+/* Ensure all existing pages in a VMA follow the policy. */
 static int
-verify_pages(struct mm_struct *mm,
-	     unsigned long addr, unsigned long end, unsigned long *nodes)
+move_verify_pages(struct vm_area_struct *vma, struct mempolicy *pol,
+		  unsigned long flags)
 {
-	while (addr < end) {
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr;
+	unsigned long start = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	if (!(flags & (MPOL_MF_MOVE | MPOL_MF_STRICT)))
+		return 0;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		struct page *p;
 		pte_t *pte;
 		pmd_t *pmd;
 		pgd_t *pgd;
 		pml4_t *pml4;
+		int err;
+
+		/*
+		 * first, if this is a file mapping and we are moving pages,
+		 * check for invalid page cache pages, and if they are unused,
+		 * remove.
+		 */
+		if (vma->vm_ops && vma->vm_ops->nopage) {
+			struct address_space *mapping =
+				vma->vm_file->f_mapping;
+			unsigned long pgoff =
+				((addr - vma->vm_start) >> PAGE_CACHE_SHIFT) +
+				vma->vm_pgoff;
+
+			p = find_get_page(mapping, pgoff);
+			if (p) {
+				err = 0;
+				if (!__mpol_node_valid(page_to_nid(p), pol)) {
+					if (!(flags & MPOL_MF_MOVE))
+						err = -EIO;
+					else
+						err = remove_invalid_filemap_page(
							p,vma,pgoff);
+				}
+				page_cache_release(p);	/* find_get_page */
+				if (err && (flags & MPOL_MF_STRICT))
+					return err;
+			}
+		}
+
+		/*
+		 * Now let's see if there is a pte-mapped page that doesn't
+		 * satisfy the policy. Because of the above, we can be sure
+		 * from here that, if there is a VMA page that's pte-mapped
+		 * and it belongs to the page cache, it either satisfies the
+		 * policy, or we don't mind if it doesn't (MF_STRICT not set).
+		 */
+
		spin_lock(&mm->page_table_lock);
 		pml4 = pml4_offset(mm, addr);
 		if (pml4_none(*pml4)) {
-			unsigned long next = (addr + PML4_SIZE) & PML4_MASK;
-			if (next > addr)
-				break;
-			addr = next;
+			spin_unlock(&mm->page_table_lock);
 			continue;
 		}
 		pgd = pml4_pgd_offset(pml4, addr);
+
 		if (pgd_none(*pgd)) {
-			addr = (addr + PGDIR_SIZE) & PGDIR_MASK;
+			spin_unlock(&mm->page_table_lock);
 			continue;
 		}
 		pmd = pmd_offset(pgd, addr);
 		if (pmd_none(*pmd)) {
-			addr = (addr + PMD_SIZE) & PMD_MASK;
+			spin_unlock(&mm->page_table_lock);
 			continue;
 		}
 		p = NULL;
@@ -267,19 +477,29 @@ verify_pages(struct mm_struct *mm,
 			p = pte_page(*pte);
 		pte_unmap(pte);
 		if (p) {
-			unsigned nid = page_to_nid(p);
-			if (!test_bit(nid, nodes))
-				return -EIO;
+			err = 0;
+			if (!__mpol_node_valid(page_to_nid(p), pol)) {
+				if (!(flags & MPOL_MF_MOVE))
+					err = -EIO;
+				else
+					err = move_invalid_page(p, pol, vma,
+								addr, pmd);
+			}
+			if (err && (flags & MPOL_MF_STRICT)) {
+				spin_unlock(&mm->page_table_lock);
+				return err;
+			}
 		}
-		addr += PAGE_SIZE;
+		spin_unlock(&mm->page_table_lock);
 	}
+
 	return 0;
 }
 
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    unsigned long *nodes, unsigned long flags)
+	    struct mempolicy *policy, unsigned long flags)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -293,9 +513,8 @@ check_range(struct mm_struct *mm, unsign
 			return ERR_PTR(-EFAULT);
 		if (prev && prev->vm_end < vma->vm_start)
 			return ERR_PTR(-EFAULT);
-		if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
-			err = verify_pages(vma->vm_mm,
-					   vma->vm_start, vma->vm_end, nodes);
+		if (flags & (MPOL_MF_MOVE | MPOL_MF_STRICT)) {
+			err = move_verify_pages(vma, policy, flags);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
			}
@@ -362,12 +581,13 @@ asmlinkage long sys_mbind(unsigned long
 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 	int err;
 
-	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
+	if ((flags & ~(unsigned long)(MPOL_MF_STRICT | MPOL_MF_MOVE)) ||
+	    mode > MPOL_MAX)
 		return -EINVAL;
 	if (start & ~PAGE_MASK)
 		return -EINVAL;
 	if (mode == MPOL_DEFAULT)
-		flags &= ~MPOL_MF_STRICT;
+		flags &= ~(MPOL_MF_STRICT | MPOL_MF_MOVE);
 	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 	end = start + len;
 	if (end < start)
@@ -387,7 +607,7 @@ asmlinkage long sys_mbind(unsigned long
 		 mode,nodes[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nodes, flags);
+	vma = check_range(mm, start, end, new, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -620,24 +840,6 @@ asmlinkage long compat_sys_mbind(compat_
 
 #endif
 
-/* Return effective policy for a VMA */
-static struct mempolicy *
-get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = current->mempolicy;
-
-	if (vma) {
-		if (vma->vm_ops && vma->vm_ops->get_policy)
-			pol = vma->vm_ops->get_policy(vma, addr);
-		else if (vma->vm_policy &&
-			 vma->vm_policy->policy != MPOL_DEFAULT)
-			pol = vma->vm_policy;
-	}
-	if (!pol)
-		pol = &default_policy;
-	return pol;
-}
-
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
 {
@@ -872,28 +1074,6 @@ int mpol_first_node(struct vm_area_struc
 	return 0;
 }
 
-/* Find secondary valid nodes for an allocation */
-int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
-{
-	struct mempolicy *pol = get_vma_policy(vma, addr);
-
-	switch (pol->policy) {
-	case MPOL_PREFERRED:
-	case MPOL_DEFAULT:
-	case MPOL_INTERLEAVE:
-		return 1;
-	case MPOL_BIND: {
-		struct zone **z;
-		for (z = pol->v.zonelist->zones; *z; z++)
-			if ((*z)->zone_pgdat->node_id == nid)
-				return 1;
-		return 0;
-	}
-	default:
-		BUG();
-		return 0;
-	}
-}
 
 /*
  * Shared memory backing store policy support.
@@ -1013,10 +1193,14 @@ restart:
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
 		struct rb_node *next = rb_next(&n->nd);
-		if (n->start >= start) {
-			if (n->end <= end)
+		if (n->start == start && n->end == end &&
+		    mpol_equal(n->policy, new->policy)) {
+			/* the same shared policy already exists, just exit */
+			goto out;
+		} else if (n->start >= start) {
+			if (n->end <= end) {
 				sp_delete(sp, n);
-			else
+			} else
 				n->start = end;
 		} else {
 			/* Old policy spanning whole new range. */
@@ -1042,6 +1226,7 @@ restart:
 	}
 	if (new)
 		sp_insert(sp, new);
+ out:
 	spin_unlock(&sp->lock);
 	if (new2) {
 		mpol_free(new2->policy);
@@ -1093,6 +1278,37 @@ void mpol_free_shared_policy(struct shar
 	spin_unlock(&p->lock);
 }
 
+struct page *
+alloc_page_shared_policy(unsigned gfp, struct shared_policy *sp,
+			 unsigned long idx)
+{
+	struct page *page;
+	struct mempolicy * shared_pol = NULL;
+
+	if (sp->root.rb_node) {
+		struct vm_area_struct pvma;
+		/* Create a pseudo vma that just contains the policy */
+		memset(&pvma, 0, sizeof(struct vm_area_struct));
+		pvma.vm_end = PAGE_SIZE;
+		pvma.vm_pgoff = idx;
+		shared_pol = mpol_shared_policy_lookup(sp, idx);
+		pvma.vm_policy = shared_pol;
+		page = alloc_page_vma(gfp, &pvma, 0);
+		mpol_free(pvma.vm_policy);
+	} else {
+		page = alloc_pages(gfp, 0);
+	}
+
+	if (page) {
+		if (shared_pol)
+			SetPageSharedPolicy(page);
+		else
+			ClearPageSharedPolicy(page);
+	}
+
+	return page;
+}
+
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
diff -puN mm/readahead.c~numa-policies-for-file-mappings-mpol_mf_move mm/readahead.c
--- 25/mm/readahead.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/readahead.c	Wed Nov 17 14:12:37 2004
@@ -245,7 +245,7 @@ __do_page_cache_readahead(struct address
 			continue;
 
 		spin_unlock_irq(&mapping->tree_lock);
-		page = page_cache_alloc_cold(mapping);
+		page = page_cache_alloc_cold(mapping, page_offset);
 		spin_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
diff -puN mm/shmem.c~numa-policies-for-file-mappings-mpol_mf_move mm/shmem.c
--- 25/mm/shmem.c~numa-policies-for-file-mappings-mpol_mf_move	Wed Nov 17 14:12:37 2004
+++ 25-akpm/mm/shmem.c	Wed Nov 17 14:12:37 2004
@@ -903,16 +903,7 @@ static struct page *
 shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
 		 unsigned long idx)
 {
-	struct vm_area_struct pvma;
-	struct page *page;
-
-	memset(&pvma, 0, sizeof(struct vm_area_struct));
-	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
-	pvma.vm_pgoff = idx;
-	pvma.vm_end = PAGE_SIZE;
-	page = alloc_page_vma(gfp, &pvma, 0);
-	mpol_free(pvma.vm_policy);
-	return page;
+	return alloc_page_shared_policy(gfp, &info->policy, idx);
 }
 #else
 static inline struct page *
_