diff -purN -X /home/mbligh/.diff.exclude reference/Documentation/numastat.txt current/Documentation/numastat.txt --- reference/Documentation/numastat.txt 1969-12-31 16:00:00.000000000 -0800 +++ current/Documentation/numastat.txt 2004-05-01 10:25:17.000000000 -0700 @@ -0,0 +1,22 @@ + +Numa policy hit/miss statistics + +/sys/devices/system/node/node*/numastat + +All units are pages. Hugepages have separate counters. + +numa_hit A process wanted to allocate memory from this node, + and succeeded. +numa_miss A process wanted to allocate memory from this node, + but ended up with memory from another. +numa_foreign A process wanted to allocate on another node, + but ended up with memory from this one. +local_node A process ran on this node and got memory from it. +other_node A process ran on this node and got memory from another node. +interleave_hit Interleaving wanted to allocate from this node + and succeeded. + +For easier reading you can use the numastat utility from the numactl package +(ftp://ftp.suse.com/pub/people/ak/numa/numactl*). Note that it only works +well right now on machines with a small number of CPUs. + diff -purN -X /home/mbligh/.diff.exclude reference/arch/i386/kernel/entry.S current/arch/i386/kernel/entry.S --- reference/arch/i386/kernel/entry.S 2004-04-30 12:55:07.000000000 -0700 +++ current/arch/i386/kernel/entry.S 2004-05-01 10:25:16.000000000 -0700 @@ -908,9 +908,9 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ - .long sys_ni_syscall /* sys_mbind */ - .long sys_ni_syscall /* 275 sys_get_mempolicy */ - .long sys_ni_syscall /* sys_set_mempolicy */ + .long sys_mbind + .long sys_get_mempolicy + .long sys_set_mempolicy .long sys_mq_open .long sys_mq_unlink .long sys_mq_timedsend diff -purN -X /home/mbligh/.diff.exclude reference/arch/ia64/ia32/binfmt_elf32.c current/arch/ia64/ia32/binfmt_elf32.c --- reference/arch/ia64/ia32/binfmt_elf32.c 2004-04-30 11:23:03.000000000 -0700 +++ current/arch/ia64/ia32/binfmt_elf32.c 2004-05-01 10:25:17.000000000 -0700 @@ -104,6 +104,7 @@ ia64_elf32_init (struct pt_regs *regs) vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_private_data = NULL; + mpol_set_vma_default(vma); down_write(¤t->mm->mmap_sem); { insert_vm_struct(current->mm, vma); @@ -190,6 +191,7 @@ ia32_setup_arg_pages (struct linux_binpr mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; mpnt->vm_private_data = 0; + mpol_set_vma_default(mpnt); insert_vm_struct(current->mm, mpnt); current->mm->total_vm = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; } diff -purN -X /home/mbligh/.diff.exclude reference/arch/ia64/kernel/entry.S current/arch/ia64/kernel/entry.S --- reference/arch/ia64/kernel/entry.S 2004-04-30 11:23:03.000000000 -0700 +++ current/arch/ia64/kernel/entry.S 2004-05-01 10:25:16.000000000 -0700 @@ -1501,9 +1501,9 @@ sys_call_table: data8 sys_clock_nanosleep data8 sys_fstatfs64 data8 sys_statfs64 - data8 sys_ni_syscall - data8 sys_ni_syscall // 1260 - data8 sys_ni_syscall + data8 sys_mbind + data8 sys_get_mempolicy // 1260 + data8 sys_set_mempolicy data8 sys_mq_open data8 sys_mq_unlink data8 sys_mq_timedsend diff -purN -X /home/mbligh/.diff.exclude reference/arch/ia64/kernel/perfmon.c current/arch/ia64/kernel/perfmon.c --- reference/arch/ia64/kernel/perfmon.c 2004-04-30 11:23:03.000000000 -0700 +++ current/arch/ia64/kernel/perfmon.c 2004-05-01 10:25:17.000000000 -0700 @@ -2308,6 +2308,7 @@ pfm_smpl_buffer_alloc(struct task_struct vma->vm_ops = NULL; vma->vm_pgoff = 0; vma->vm_file = NULL; + mpol_set_vma_default(vma); 
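The counters documented in numastat.txt above are emitted as plain "name value" lines, so they are easy to consume without the numactl tool. A minimal userspace reader, not part of the patch (the sysfs path comes from the documentation above; the miss-ratio arithmetic is illustrative):

#include <stdio.h>
#include <string.h>

/* Read the per-node counters documented in numastat.txt and print a
 * miss ratio for node 0. All counts are in pages. */
int main(void)
{
	FILE *f = fopen("/sys/devices/system/node/node0/numastat", "r");
	char name[32];
	unsigned long val, hit = 0, miss = 0;

	if (!f) {
		perror("numastat");
		return 1;
	}
	while (fscanf(f, "%31s %lu", name, &val) == 2) {
		printf("%-16s %lu pages\n", name, val);
		if (!strcmp(name, "numa_hit"))
			hit = val;
		else if (!strcmp(name, "numa_miss"))
			miss = val;
	}
	fclose(f);
	if (hit + miss)
		printf("miss ratio: %.2f%%\n", 100.0 * miss / (hit + miss));
	return 0;
}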
vma->vm_private_data = NULL; /* diff -purN -X /home/mbligh/.diff.exclude reference/arch/ia64/mm/init.c current/arch/ia64/mm/init.c --- reference/arch/ia64/mm/init.c 2004-04-30 11:23:03.000000000 -0700 +++ current/arch/ia64/mm/init.c 2004-05-01 10:25:17.000000000 -0700 @@ -132,6 +132,7 @@ ia64_init_addr_space (void) vma->vm_pgoff = 0; vma->vm_file = NULL; vma->vm_private_data = NULL; + mpol_set_vma_default(vma); insert_vm_struct(current->mm, vma); } @@ -144,6 +145,7 @@ ia64_init_addr_space (void) vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | VM_RESERVED; + mpol_set_vma_default(vma); insert_vm_struct(current->mm, vma); } } diff -purN -X /home/mbligh/.diff.exclude reference/arch/m68k/atari/stram.c current/arch/m68k/atari/stram.c --- reference/arch/m68k/atari/stram.c 2004-04-30 11:23:03.000000000 -0700 +++ current/arch/m68k/atari/stram.c 2004-05-01 10:25:17.000000000 -0700 @@ -752,7 +752,7 @@ static int unswap_by_read(unsigned short /* Get a page for the entry, using the existing swap cache page if there is one. Otherwise, get a clean page and read the swap into it. */ - page = read_swap_cache_async(entry); + page = read_swap_cache_async(entry, NULL, 0); if (!page) { swap_free(entry); return -ENOMEM; diff -purN -X /home/mbligh/.diff.exclude reference/arch/s390/kernel/compat_exec.c current/arch/s390/kernel/compat_exec.c --- reference/arch/s390/kernel/compat_exec.c 2004-04-30 11:23:09.000000000 -0700 +++ current/arch/s390/kernel/compat_exec.c 2004-05-01 10:25:17.000000000 -0700 @@ -72,6 +72,7 @@ int setup_arg_pages32(struct linux_binpr mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; + mpol_set_vma_default(mpnt); INIT_LIST_HEAD(&mpnt->shared); mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); diff -purN -X /home/mbligh/.diff.exclude reference/arch/x86_64/ia32/ia32_binfmt.c current/arch/x86_64/ia32/ia32_binfmt.c --- reference/arch/x86_64/ia32/ia32_binfmt.c 2004-04-30 11:23:12.000000000 -0700 +++ current/arch/x86_64/ia32/ia32_binfmt.c 2004-05-01 10:25:17.000000000 -0700 @@ -365,6 +365,7 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; + mpol_set_vma_default(mpnt); INIT_LIST_HEAD(&mpnt->shared); mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); diff -purN -X /home/mbligh/.diff.exclude reference/drivers/base/node.c current/drivers/base/node.c --- reference/drivers/base/node.c 2004-03-11 14:34:06.000000000 -0800 +++ current/drivers/base/node.c 2004-05-01 10:25:17.000000000 -0700 @@ -30,13 +30,20 @@ static ssize_t node_read_cpumap(struct s static SYSDEV_ATTR(cpumap,S_IRUGO,node_read_cpumap,NULL); +/* Can be overwritten by architecture specific code. 
*/ +int __attribute__((weak)) hugetlb_report_node_meminfo(int node, char *buf) +{ + return 0; +} + #define K(x) ((x) << (PAGE_SHIFT - 10)) static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) { + int n; int nid = dev->id; struct sysinfo i; si_meminfo_node(&i, nid); - return sprintf(buf, "\n" + n = sprintf(buf, "\n" "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" @@ -51,10 +58,52 @@ static ssize_t node_read_meminfo(struct nid, K(i.freehigh), nid, K(i.totalram-i.totalhigh), nid, K(i.freeram-i.freehigh)); + n += hugetlb_report_node_meminfo(nid, buf + n); + return n; } + #undef K static SYSDEV_ATTR(meminfo,S_IRUGO,node_read_meminfo,NULL); +static ssize_t node_read_numastat(struct sys_device * dev, char * buf) +{ + unsigned long numa_hit, numa_miss, interleave_hit, numa_foreign; + unsigned long local_node, other_node; + int i, cpu; + pg_data_t *pg = NODE_DATA(dev->id); + numa_hit = 0; + numa_miss = 0; + interleave_hit = 0; + numa_foreign = 0; + local_node = 0; + other_node = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &pg->node_zones[i]; + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *ps = &z->pageset[cpu]; + numa_hit += ps->numa_hit; + numa_miss += ps->numa_miss; + numa_foreign += ps->numa_foreign; + interleave_hit += ps->interleave_hit; + local_node += ps->local_node; + other_node += ps->other_node; + } + } + return sprintf(buf, + "numa_hit %lu\n" + "numa_miss %lu\n" + "numa_foreign %lu\n" + "interleave_hit %lu\n" + "local_node %lu\n" + "other_node %lu\n", + numa_hit, + numa_miss, + numa_foreign, + interleave_hit, + local_node, + other_node); +} +static SYSDEV_ATTR(numastat,S_IRUGO,node_read_numastat,NULL); /* * register_node - Setup a driverfs device for a node. 
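The __attribute__((weak)) stub at the top of this hunk is the whole override mechanism: drivers/base/node.c always links the fallback, and an architecture that implements hugetlb accounting provides a strong definition that the linker prefers, with no #ifdef in the core. A two-file sketch of the pattern, with hypothetical names (not from the patch):

/* core.c: weak default, compiled into the core unconditionally */
#include <stdio.h>

int __attribute__((weak)) report_node_extras(int node, char *buf)
{
	return 0;	/* append nothing; caller adds 0 to its byte count */
}

/* arch.c: a strong definition anywhere in the link silently wins */
int report_node_extras(int node, char *buf)
{
	return sprintf(buf, "Node %d Extras: present\n", node);
}

The return-value convention matches node_read_meminfo() above: bytes appended to buf, so the weak default simply adds nothing.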
@@ -74,6 +123,7 @@ int __init register_node(struct node *no if (!error){ sysdev_create_file(&node->sysdev, &attr_cpumap); sysdev_create_file(&node->sysdev, &attr_meminfo); + sysdev_create_file(&node->sysdev, &attr_numastat); } return error; } diff -purN -X /home/mbligh/.diff.exclude reference/fs/exec.c current/fs/exec.c --- reference/fs/exec.c 2004-05-01 10:21:04.000000000 -0700 +++ current/fs/exec.c 2004-05-01 10:25:17.000000000 -0700 @@ -427,6 +427,7 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; mpnt->vm_file = NULL; + mpol_set_vma_default(mpnt); INIT_LIST_HEAD(&mpnt->shared); mpnt->vm_private_data = (void *) 0; insert_vm_struct(mm, mpnt); diff -purN -X /home/mbligh/.diff.exclude reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c --- reference/fs/hugetlbfs/inode.c 2004-05-01 10:23:54.000000000 -0700 +++ current/fs/hugetlbfs/inode.c 2004-05-01 10:25:17.000000000 -0700 @@ -375,6 +375,7 @@ static struct inode *hugetlbfs_get_inode inode = new_inode(sb); if (inode) { + struct hugetlbfs_inode_info *info; inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; @@ -383,6 +384,8 @@ static struct inode *hugetlbfs_get_inode inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = HUGETLBFS_I(inode); + mpol_shared_policy_init(&info->policy); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -510,6 +513,33 @@ static void hugetlbfs_put_super(struct s } } +static kmem_cache_t *hugetlbfs_inode_cachep; + +static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) +{ + struct hugetlbfs_inode_info *p; + + p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); + if (!p) + return NULL; + return &p->vfs_inode; +} + +static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&ei->vfs_inode); +} + +static void hugetlbfs_destroy_inode(struct inode *inode) +{ + mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); +} + static struct address_space_operations hugetlbfs_aops = { .readpage = hugetlbfs_readpage, .prepare_write = hugetlbfs_prepare_write, @@ -541,6 +571,8 @@ static struct inode_operations hugetlbfs }; static struct super_operations hugetlbfs_ops = { + .alloc_inode = hugetlbfs_alloc_inode, + .destroy_inode = hugetlbfs_destroy_inode, .statfs = hugetlbfs_statfs, .drop_inode = hugetlbfs_drop_inode, .put_super = hugetlbfs_put_super, @@ -755,9 +787,16 @@ static int __init init_hugetlbfs_fs(void int error; struct vfsmount *vfsmount; + hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", + sizeof(struct hugetlbfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (hugetlbfs_inode_cachep == NULL) + return -ENOMEM; + error = register_filesystem(&hugetlbfs_fs_type); if (error) - return error; + goto out; vfsmount = kern_mount(&hugetlbfs_fs_type); @@ -767,11 +806,16 @@ static int __init init_hugetlbfs_fs(void } error = PTR_ERR(vfsmount); + + out: + if (error) + kmem_cache_destroy(hugetlbfs_inode_cachep); return error; } static void __exit exit_hugetlbfs_fs(void) { + kmem_cache_destroy(hugetlbfs_inode_cachep); unregister_filesystem(&hugetlbfs_fs_type); } diff -purN -X /home/mbligh/.diff.exclude 
reference/include/asm-ia64/unistd.h current/include/asm-ia64/unistd.h --- reference/include/asm-ia64/unistd.h 2004-04-30 11:23:48.000000000 -0700 +++ current/include/asm-ia64/unistd.h 2004-05-01 10:25:16.000000000 -0700 @@ -248,9 +248,9 @@ #define __NR_clock_nanosleep 1256 #define __NR_fstatfs64 1257 #define __NR_statfs64 1258 -#define __NR_reserved1 1259 /* reserved for NUMA interface */ -#define __NR_reserved2 1260 /* reserved for NUMA interface */ -#define __NR_reserved3 1261 /* reserved for NUMA interface */ +#define __NR_mbind 1259 +#define __NR_get_mempolicy 1260 +#define __NR_set_mempolicy 1261 #define __NR_mq_open 1262 #define __NR_mq_unlink 1263 #define __NR_mq_timedsend 1264 diff -purN -X /home/mbligh/.diff.exclude reference/include/asm-x86_64/unistd.h current/include/asm-x86_64/unistd.h --- reference/include/asm-x86_64/unistd.h 2004-04-30 11:23:52.000000000 -0700 +++ current/include/asm-x86_64/unistd.h 2004-05-01 10:25:16.000000000 -0700 @@ -534,7 +534,7 @@ __SYSCALL(__NR_utimes, sys_utimes) __SYSCALL(__NR_vserver, sys_ni_syscall) #define __NR_vserver 236 __SYSCALL(__NR_vserver, sys_ni_syscall) -#define __NR_mbind 237 +#define __NR_mbind 237 __SYSCALL(__NR_mbind, sys_ni_syscall) #define __NR_set_mempolicy 238 __SYSCALL(__NR_set_mempolicy, sys_ni_syscall) @@ -546,7 +546,7 @@ __SYSCALL(__NR_mq_open, sys_mq_open) __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 242 __SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) -#define __NR_mq_timedreceive 243 +#define __NR_mq_timedreceive 243 __SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) #define __NR_mq_notify 244 __SYSCALL(__NR_mq_notify, sys_mq_notify) diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/bitmap.h current/include/linux/bitmap.h --- reference/include/linux/bitmap.h 2004-04-30 11:23:52.000000000 -0700 +++ current/include/linux/bitmap.h 2004-05-01 10:25:16.000000000 -0700 @@ -29,7 +29,8 @@ static inline void bitmap_fill(unsigned static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, int bits) { - memcpy(dst, src, BITS_TO_LONGS(bits)*sizeof(unsigned long)); + int len = BITS_TO_LONGS(bits)*sizeof(unsigned long); + memcpy(dst, src, len); } void bitmap_shift_right(unsigned long *dst, diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/gfp.h current/include/linux/gfp.h --- reference/include/linux/gfp.h 2004-04-30 11:23:53.000000000 -0700 +++ current/include/linux/gfp.h 2004-05-01 10:25:16.000000000 -0700 @@ -4,6 +4,10 @@ #include #include #include +#include + +struct vm_area_struct; + /* * GFP bitmasks.. */ @@ -69,19 +73,38 @@ * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets * optimized to &contig_page_data at compile-time. 
*/ -extern struct page * FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); -static inline struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order) +extern struct page * +FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); + +static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, + unsigned int order) { if (unlikely(order >= MAX_ORDER)) return NULL; - return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + return __alloc_pages(gfp_mask, order, + NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); } +#ifdef CONFIG_NUMA +extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); + +static inline struct page * +alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + if (unlikely(order >= MAX_ORDER)) + return NULL; + + return alloc_pages_current(gfp_mask, order); +} +extern struct page *alloc_page_vma(unsigned gfp_mask, + struct vm_area_struct *vma, unsigned long addr); +#else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page(gfp_mask) \ - alloc_pages_node(numa_node_id(), gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#endif +#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/hugetlb.h current/include/linux/hugetlb.h --- reference/include/linux/hugetlb.h 2004-04-30 11:23:53.000000000 -0700 +++ current/include/linux/hugetlb.h 2004-05-01 10:25:17.000000000 -0700 @@ -3,6 +3,8 @@ #ifdef CONFIG_HUGETLB_PAGE +#include + struct ctl_table; static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -103,6 +105,17 @@ struct hugetlbfs_sb_info { spinlock_t stat_lock; }; + +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/mempolicy.h current/include/linux/mempolicy.h --- reference/include/linux/mempolicy.h 1969-12-31 16:00:00.000000000 -0800 +++ current/include/linux/mempolicy.h 2004-05-01 10:25:16.000000000 -0700 @@ -0,0 +1,221 @@ +#ifndef _LINUX_MEMPOLICY_H +#define _LINUX_MEMPOLICY_H 1 + +#include + +/* + * NUMA memory policies for Linux. + * Copyright 2003,2004 Andi Kleen SuSE Labs + */ + +/* Policies */ +#define MPOL_DEFAULT 0 +#define MPOL_PREFERRED 1 +#define MPOL_BIND 2 +#define MPOL_INTERLEAVE 3 + +#define MPOL_MAX MPOL_INTERLEAVE + +/* Flags for get_mem_policy */ +#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ +#define MPOL_F_ADDR (1<<1) /* look up vma using address */ + +/* Flags for mbind */ +#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +struct vm_area_struct; + +#ifdef CONFIG_NUMA + +/* + * Describe a memory policy. + * + * A mempolicy can be either associated with a process or with a VMA. + * For VMA related allocations the VMA policy is preferred, otherwise + * the process policy is used. 
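That precedence rule (VMA policy first, then process policy, then global default) is the heart of the design; a condensed sketch of the lookup, mirroring the get_vma_policy() helper this patch adds in mm/mempolicy.c (the standalone function name here is illustrative):

/* Effective policy for an allocation at addr inside vma (or NULL) */
static struct mempolicy *
effective_policy(struct task_struct *tsk, struct vm_area_struct *vma,
		 unsigned long addr)
{
	struct mempolicy *pol = tsk->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr); /* e.g. shmem */
		else if (vma->vm_policy &&
			 vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	return pol ? pol : &default_policy;	/* never returns NULL */
}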
Interrupts ignore the memory policy + * of the current process. + * + * Locking policy for interleave: + * In process context there is no locking because only the process accesses + * its own state. All vma manipulation is somewhat protected by a down_read on + * mmap_sem. For allocating in the interleave policy the page_table_lock + * must also be acquired to protect il_next. + * + * Freeing policy: + * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. + * All other policies don't have any external state. mpol_free() handles this. + * + * Copying policy objects: + * For MPOL_BIND the zonelist must always be duplicated. mpol_clone() does this. + */ +struct mempolicy { + atomic_t refcnt; + short policy; /* See MPOL_* above */ + union { + struct zonelist *zonelist; /* bind */ + short preferred_node; /* preferred */ + DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ + /* undefined for default */ + } v; +}; + +/* A NULL mempolicy pointer is a synonym for &default_policy. */ +extern struct mempolicy default_policy; + +/* + * Support for managing mempolicy data objects (clone, copy, destroy) + * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. + */ + +extern void __mpol_free(struct mempolicy *pol); +static inline void mpol_free(struct mempolicy *pol) +{ + if (pol) + __mpol_free(pol); +} + +extern struct mempolicy *__mpol_copy(struct mempolicy *pol); +static inline struct mempolicy *mpol_copy(struct mempolicy *pol) +{ + if (pol) + pol = __mpol_copy(pol); + return pol; +} + +#define vma_policy(vma) ((vma)->vm_policy) +#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) + +static inline void mpol_get(struct mempolicy *pol) +{ + if (pol) + atomic_inc(&pol->refcnt); +} + +extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b); +static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (a == b) + return 1; + return __mpol_equal(a, b); +} +#define vma_mpol_equal(a,b) mpol_equal(vma_policy(a), vma_policy(b)) + +/* Could later add inheritance of the process policy here. */ + +#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL) + +/* + * Hugetlb policy. i386 hugetlb so far works with node numbers + * instead of zone lists, so give it special interfaces for now. + */ +extern int mpol_first_node(struct vm_area_struct *vma, unsigned long addr); +extern int mpol_node_valid(int nid, struct vm_area_struct *vma, + unsigned long addr); + +/* + * Tree of shared policies for a shared memory region. + * Maintain the policies in a pseudo mm that contains vmas. The vmas + * carry the policy. As a special twist the pseudo mm is indexed in pages, not + * bytes, so that we can work with shared memory segments bigger than + * unsigned long.
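Indexing in pages is what keeps the key in range on 32-bit: a 16GB segment is only 4M page-sized slots, while its byte size would overflow an unsigned long. A sketch of the index computation this implies, the same arithmetic alloc_page_vma() uses for interleaving (helper name hypothetical):

/* page index of a faulting address within a shared mapping;
 * vm_pgoff is already in pages, so no byte offset is ever formed */
static unsigned long shared_policy_index(struct vm_area_struct *vma,
					 unsigned long addr)
{
	return vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
}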
+ */ + +struct sp_node { + struct rb_node nd; + unsigned long start, end; + struct mempolicy *policy; +}; + +struct shared_policy { + struct rb_root root; + struct semaphore sem; +}; + +static inline void mpol_shared_policy_init(struct shared_policy *info) +{ + info->root = RB_ROOT; + init_MUTEX(&info->sem); +} + +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, + struct mempolicy *new); +void mpol_free_shared_policy(struct shared_policy *p); +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + unsigned long idx); + +#else + +struct mempolicy {}; + +static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + return 1; +} +#define vma_mpol_equal(a,b) 1 + +#define mpol_set_vma_default(vma) do {} while(0) + +static inline void mpol_free(struct mempolicy *p) +{ +} + +static inline void mpol_get(struct mempolicy *pol) +{ +} + +static inline struct mempolicy *mpol_copy(struct mempolicy *old) +{ + return NULL; +} + +static inline int mpol_first_node(struct vm_area_struct *vma, unsigned long a) +{ + return numa_node_id(); +} + +static inline int +mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long a) +{ + return 1; +} + +struct shared_policy {}; + +static inline int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, + struct mempolicy *new) +{ + return -EINVAL; +} + +static inline void mpol_shared_policy_init(struct shared_policy *info) +{ +} + +static inline void mpol_free_shared_policy(struct shared_policy *p) +{ +} + +static inline struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + return NULL; +} + +#define vma_policy(vma) NULL +#define vma_set_policy(vma, pol) do {} while(0) + +#endif /* CONFIG_NUMA */ +#endif /* __KERNEL__ */ + +#endif diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/mm.h current/include/linux/mm.h --- reference/include/linux/mm.h 2004-05-01 10:23:54.000000000 -0700 +++ current/include/linux/mm.h 2004-05-01 10:25:16.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -47,6 +48,10 @@ extern int page_cluster; * * This structure is exactly 64 bytes on ia32. Please think very, very hard * before adding anything to it. + * [Now 4 bytes more on 32bit NUMA machines. Sorry. -AK. + * But if you want to recover the 4 bytes, just remove vm_next. It is redundant + * with vm_rb. Will be a lot of editing work though. vm_rb.color is redundant + * too.] */ struct vm_area_struct { struct mm_struct * vm_mm; /* The address space we belong to. */ @@ -77,6 +82,10 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL).
*/ void * vm_private_data; /* was vm_pte (shared mem) */ + +#ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ +#endif }; /* @@ -148,6 +157,11 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +#ifdef CONFIG_NUMA + int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); + struct mempolicy *(*get_policy)(struct vm_area_struct *vma, + unsigned long addr); +#endif }; struct mmu_gather; @@ -430,6 +444,9 @@ extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, unsigned long address, int *type); +int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); +struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr); struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); void shmem_lock(struct file * file, int lock); int shmem_zero_setup(struct vm_area_struct *); @@ -632,6 +649,11 @@ static inline struct vm_area_struct * fi return vma; } +static inline unsigned long vma_pages(struct vm_area_struct *vma) +{ + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; +} + extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); extern unsigned int nr_used_zone_pages(void); diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/mmzone.h current/include/linux/mmzone.h --- reference/include/linux/mmzone.h 2004-04-30 11:23:53.000000000 -0700 +++ current/include/linux/mmzone.h 2004-05-01 10:25:17.000000000 -0700 @@ -52,6 +52,14 @@ struct per_cpu_pages { struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ +#ifdef CONFIG_NUMA + unsigned long numa_hit; /* allocated in intended node */ + unsigned long numa_miss; /* allocated in non-intended node */ + unsigned long numa_foreign; /* was intended here, hit elsewhere */ + unsigned long interleave_hit; /* interleaver preferred this zone */ + unsigned long local_node; /* allocation from local node */ + unsigned long other_node; /* allocation from other node */ +#endif } ____cacheline_aligned_in_smp; #define ZONE_DMA 0 diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/sched.h current/include/linux/sched.h --- reference/include/linux/sched.h 2004-05-01 10:21:04.000000000 -0700 +++ current/include/linux/sched.h 2004-05-01 10:25:16.000000000 -0700 @@ -29,6 +29,7 @@ #include #include #include +#include struct exec_domain; @@ -506,6 +507,9 @@ struct task_struct { unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use.
*/ + + struct mempolicy *mempolicy; + short il_next; /* could be shared with used_math */ }; static inline pid_t process_group(struct task_struct *tsk) diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/shmem_fs.h current/include/linux/shmem_fs.h --- reference/include/linux/shmem_fs.h 2003-06-19 14:41:54.000000000 -0700 +++ current/include/linux/shmem_fs.h 2004-05-01 10:25:17.000000000 -0700 @@ -2,6 +2,7 @@ #define __SHMEM_FS_H #include +#include /* inode in-kernel data */ @@ -15,6 +16,7 @@ struct shmem_inode_info { unsigned long alloced; /* data pages allocated to file */ unsigned long swapped; /* subtotal assigned to swap */ unsigned long flags; + struct shared_policy policy; struct list_head list; struct inode vfs_inode; }; diff -purN -X /home/mbligh/.diff.exclude reference/include/linux/swap.h current/include/linux/swap.h --- reference/include/linux/swap.h 2004-04-30 11:23:54.000000000 -0700 +++ current/include/linux/swap.h 2004-05-01 10:25:17.000000000 -0700 @@ -151,7 +151,7 @@ struct swap_list_t { extern void out_of_memory(void); /* linux/mm/memory.c */ -extern void swapin_readahead(swp_entry_t); +extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; @@ -200,7 +200,8 @@ extern int move_from_swap_cache(struct p extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); extern struct page * lookup_swap_cache(swp_entry_t); -extern struct page * read_swap_cache_async(swp_entry_t); +extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); /* linux/mm/swapfile.c */ extern int total_swap_pages; @@ -242,7 +243,7 @@ extern spinlock_t swaplock; #define free_swap_and_cache(swp) /*NOTHING*/ #define swap_duplicate(swp) /*NOTHING*/ #define swap_free(swp) /*NOTHING*/ -#define read_swap_cache_async(swp) NULL +#define read_swap_cache_async(swp,vma,addr) NULL #define lookup_swap_cache(swp) NULL #define valid_swaphandles(swp, off) 0 #define can_share_swap_page(p) 0 diff -purN -X /home/mbligh/.diff.exclude reference/ipc/shm.c current/ipc/shm.c --- reference/ipc/shm.c 2004-04-30 11:23:55.000000000 -0700 +++ current/ipc/shm.c 2004-05-01 10:25:17.000000000 -0700 @@ -163,6 +163,10 @@ static struct vm_operations_struct shm_v .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .nopage = shmem_nopage, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif }; static int newseg (key_t key, int shmflg, size_t size) diff -purN -X /home/mbligh/.diff.exclude reference/kernel/exit.c current/kernel/exit.c --- reference/kernel/exit.c 2004-04-30 11:23:55.000000000 -0700 +++ current/kernel/exit.c 2004-05-01 10:25:17.000000000 -0700 @@ -790,6 +790,7 @@ asmlinkage NORET_TYPE void do_exit(long __exit_fs(tsk); exit_namespace(tsk); exit_thread(); + mpol_free(tsk->mempolicy); if (tsk->signal->leader) disassociate_ctty(1); diff -purN -X /home/mbligh/.diff.exclude reference/kernel/fork.c current/kernel/fork.c --- reference/kernel/fork.c 2004-05-01 10:23:54.000000000 -0700 +++ current/kernel/fork.c 2004-05-01 10:25:17.000000000 -0700 @@ -270,6 +270,7 @@ static inline int dup_mmap(struct mm_str struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge = 0; + struct mempolicy *pol; down_write(&oldmm->mmap_sem); flush_cache_mm(current->mm); @@ -311,6 +312,11 @@ static inline int dup_mmap(struct mm_str 
if (!tmp) goto fail_nomem; *tmp = *mpnt; + pol = mpol_copy(vma_policy(mpnt)); + retval = PTR_ERR(pol); + if (IS_ERR(pol)) + goto fail_nomem_policy; + vma_set_policy(tmp, pol); tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; @@ -357,6 +363,8 @@ out: flush_tlb_mm(current->mm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); fail_nomem: retval = -ENOMEM; fail: @@ -963,10 +971,16 @@ struct task_struct *copy_process(unsigne p->security = NULL; p->io_context = NULL; p->audit_context = NULL; + p->mempolicy = mpol_copy(p->mempolicy); + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup; + } retval = -ENOMEM; if ((retval = security_task_alloc(p))) - goto bad_fork_cleanup; + goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_security; /* copy all the process information */ @@ -1112,6 +1126,8 @@ bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_security: security_task_free(p); +bad_fork_cleanup_policy: + mpol_free(p->mempolicy); bad_fork_cleanup: if (p->pid > 0) free_pidmap(p->pid); diff -purN -X /home/mbligh/.diff.exclude reference/kernel/sys.c current/kernel/sys.c --- reference/kernel/sys.c 2004-04-30 11:23:56.000000000 -0700 +++ current/kernel/sys.c 2004-05-01 10:25:17.000000000 -0700 @@ -271,6 +271,9 @@ cond_syscall(compat_sys_mq_timedsend) cond_syscall(compat_sys_mq_timedreceive) cond_syscall(compat_sys_mq_notify) cond_syscall(compat_sys_mq_getsetattr) +cond_syscall(sys_mbind) +cond_syscall(sys_get_mempolicy) +cond_syscall(sys_set_mempolicy) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) diff -purN -X /home/mbligh/.diff.exclude reference/mm/Makefile current/mm/Makefile --- reference/mm/Makefile 2004-04-30 11:23:56.000000000 -0700 +++ current/mm/Makefile 2004-05-01 10:25:17.000000000 -0700 @@ -13,3 +13,4 @@ obj-y := bootmem.o filemap.o mempool.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_NUMA) += mempolicy.o diff -purN -X /home/mbligh/.diff.exclude reference/mm/memory.c current/mm/memory.c --- reference/mm/memory.c 2004-05-01 10:23:54.000000000 -0700 +++ current/mm/memory.c 2004-05-01 10:25:17.000000000 -0700 @@ -1071,7 +1071,7 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - new_page = alloc_page(GFP_HIGHUSER); + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto no_new_page; copy_cow_page(old_page,new_page,address); @@ -1237,9 +1237,17 @@ EXPORT_SYMBOL(vmtruncate); * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. */ -void swapin_readahead(swp_entry_t entry) +void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) { +#ifdef CONFIG_NUMA + struct vm_area_struct *next_vma = vma ? 
vma->vm_next : NULL; +#endif int i, num; struct page *new_page; unsigned long offset; @@ -1251,10 +1259,31 @@ void swapin_readahead(swp_entry_t entry) for (i = 0; i < num; offset++, i++) { /* Ok, do the async read-ahead now */ new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset)); + offset), vma, addr); if (!new_page) break; page_cache_release(new_page); +#ifdef CONFIG_NUMA + /* + * Find the next applicable VMA for the NUMA policy. + */ + addr += PAGE_SIZE; + if (addr == 0) + vma = NULL; + if (vma) { + if (addr >= vma->vm_end) { + vma = next_vma; + next_vma = vma ? vma->vm_next : NULL; + } + if (vma && addr < vma->vm_start) + vma = NULL; + } else { + if (next_vma && addr >= next_vma->vm_start) { + vma = next_vma; + next_vma = vma->vm_next; + } + } +#endif } lru_add_drain(); /* Push any new pages onto the LRU now */ } @@ -1276,8 +1305,8 @@ static int do_swap_page(struct mm_struct spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { - swapin_readahead(entry); - page = read_swap_cache_async(entry); + swapin_readahead(entry, address, vma); + page = read_swap_cache_async(entry, vma, address); if (!page) { /* * Back out if somebody else faulted in this pte while @@ -1372,7 +1401,7 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = alloc_page(GFP_HIGHUSER); + page = alloc_page_vma(GFP_HIGHUSER, vma, addr); if (!page) goto no_mem; clear_user_highpage(page, addr); @@ -1454,7 +1483,7 @@ retry: * Should we do an early C-O-W break? */ if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); + struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; copy_user_highpage(page, new_page, address); diff -purN -X /home/mbligh/.diff.exclude reference/mm/mempolicy.c current/mm/mempolicy.c --- reference/mm/mempolicy.c 1969-12-31 16:00:00.000000000 -0800 +++ current/mm/mempolicy.c 2004-05-01 10:25:17.000000000 -0700 @@ -0,0 +1,1014 @@ +/* + * Simple NUMA memory policy for the Linux kernel. + * + * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, version 2. + * + * NUMA policy allows the user to give hints about which node(s) memory should + * be allocated on. + * + * Support four policies per VMA and per process: + * + * The VMA policy has priority over the process policy for a page fault. + * + * interleave Allocate memory interleaved over a set of nodes, + * with normal fallback if it fails. + * For VMA-based allocations this interleaves based on the + * offset into the backing object or offset into the mapping + * for anonymous memory. For process policy a process counter + * is used. + * bind Only allocate memory on a specific set of nodes, + * no fallback. + * preferred Try a specific node first before normal fallback. + * As a special case node -1 here means do the allocation + * on the local CPU. This is normally identical to default, + * but useful to set in a VMA when you have a non-default + * process policy. + * default Allocate on the local node first, or when on a VMA + * use the process policy. This is what Linux always did + * in a NUMA-aware kernel and still does by, ahem, default. + * + * The process policy is applied for most non-interrupt memory allocations + * in that process' context. Interrupts ignore the policies and always + * try to allocate on the local CPU. The VMA policy is only applied for memory + * allocations for a VMA in the VM.
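From userspace the policies are driven through the three syscalls wired up earlier in this series; glibc had no wrappers at the time, so raw syscall(2) is the natural interface. A hedged sketch (slot 275 for sys_get_mempolicy is explicit in the i386 entry.S hunk above, which puts set_mempolicy at 276; the nodemask layout follows get_nodes() below):

#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_INTERLEAVE		3	/* from mempolicy.h above */
#define __NR_set_mempolicy	276	/* i386 slot, per entry.S above */

/* Interleave all future allocations of this process across nodes 0-1. */
int main(void)
{
	unsigned long nodemask = 0x3;	/* bits 0 and 1 set */

	/* set_mempolicy(mode, nodemask, maxnode) */
	if (syscall(__NR_set_mempolicy, MPOL_INTERLEAVE,
		    &nodemask, sizeof(nodemask) * 8) < 0)
		return 1;
	return 0;
}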
+ * + * Currently there are a few corner cases in swapping where the policy + * is not applied, but the majority should be handled. When process policy + * is used it is not remembered over swap outs/swap ins. + * + * Only the highest zone in the zone hierarchy gets policied. Allocations + * requesting a lower zone just use default policy. This implies that + * on systems with highmem, kernel lowmem allocations don't get policied. + * Same with GFP_DMA allocations. + * + * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * all users and remembered even when nobody has memory mapped. + */ + +/* Notebook: + fix mmap readahead to honour policy and enable policy for any page cache + object + statistics for bigpages + global policy for page cache? currently it uses process policy. Requires + first item above. + handle mremap for shared memory (currently ignored for the policy) + grows down? + make bind policy root only? It can trigger oom much faster and the + kernel is not always grateful with that. + could replace all the switch()es with a mempolicy_ops structure. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static kmem_cache_t *policy_cache; +static kmem_cache_t *sn_cache; + +#define PDprintk(fmt...) + +/* Highest zone. A specific allocation for a zone below that is not + policied. */ +static int policy_zone; + +static struct mempolicy default_policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .policy = MPOL_DEFAULT, +}; + +/* Check if all specified nodes are online */ +static int nodes_online(unsigned long *nodes) +{ + DECLARE_BITMAP(offline, MAX_NUMNODES); + + bitmap_copy(offline, node_online_map, MAX_NUMNODES); + if (bitmap_empty(offline, MAX_NUMNODES)) + set_bit(0, offline); + bitmap_complement(offline, MAX_NUMNODES); + bitmap_and(offline, offline, nodes, MAX_NUMNODES); + if (!bitmap_empty(offline, MAX_NUMNODES)) + return -EINVAL; + return 0; +} + +/* Do sanity checking on a policy */ +static int mpol_check_policy(int mode, unsigned long *nodes) +{ + int empty = bitmap_empty(nodes, MAX_NUMNODES); + + switch (mode) { + case MPOL_DEFAULT: + if (!empty) + return -EINVAL; + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + /* Preferred will only use the first bit, but allow + more for now. */ + if (empty) + return -EINVAL; + break; + } + return nodes_online(nodes); +} + +/* Copy a node mask from user space. */ +static int get_nodes(unsigned long *nodes, unsigned long *nmask, + unsigned long maxnode, int mode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non-supported part is all zero. */ + if (nmask && nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + bitmap_zero(nodes, MAX_NUMNODES); + if (nmask && copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes[nlongs-1] &= endmask; + return mpol_check_policy(mode, nodes); +} + +/* Generate a custom zonelist for the BIND policy.
*/ +static struct zonelist *bind_zonelist(unsigned long *nodes) +{ + struct zonelist *zl; + int num, max, nd; + + max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); + zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + if (!zl) + return NULL; + num = 0; + for (nd = find_first_bit(nodes, MAX_NUMNODES); + nd < MAX_NUMNODES; + nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { + int k; + for (k = MAX_NR_ZONES-1; k >= 0; k--) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (!z->present_pages) + continue; + zl->zones[num++] = z; + if (k > policy_zone) + policy_zone = k; + } + } + BUG_ON(num >= max); + zl->zones[num] = NULL; + return zl; +} + +/* Create a new policy */ +static struct mempolicy *mpol_new(int mode, unsigned long *nodes) +{ + struct mempolicy *policy; + + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); + if (mode == MPOL_DEFAULT) + return NULL; + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!policy) + return ERR_PTR(-ENOMEM); + atomic_set(&policy->refcnt, 1); + switch (mode) { + case MPOL_INTERLEAVE: + bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); + break; + case MPOL_PREFERRED: + policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); + if (policy->v.preferred_node >= MAX_NUMNODES) + policy->v.preferred_node = -1; + break; + case MPOL_BIND: + policy->v.zonelist = bind_zonelist(nodes); + if (policy->v.zonelist == NULL) { + kmem_cache_free(policy_cache, policy); + return ERR_PTR(-ENOMEM); + } + break; + } + policy->policy = mode; + return policy; +} + +/* Ensure all existing pages follow the policy. */ +static int +verify_pages(unsigned long addr, unsigned long end, unsigned long *nodes) +{ + while (addr < end) { + struct page *p; + pte_t *pte; + pmd_t *pmd; + pgd_t *pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + addr = (addr + PGDIR_SIZE) & PGDIR_MASK; + continue; + } + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd)) { + addr = (addr + PMD_SIZE) & PMD_MASK; + continue; + } + p = NULL; + pte = pte_offset_map(pmd, addr); + if (pte_present(*pte)) + p = pte_page(*pte); + pte_unmap(pte); + if (p) { + unsigned nid = page_to_nid(p); + if (!test_bit(nid, nodes)) + return -EIO; + } + addr += PAGE_SIZE; + } + return 0; +} + +/* Step 1: check the range */ +static struct vm_area_struct * +check_range(struct mm_struct *mm, unsigned long start, unsigned long end, + unsigned long *nodes, unsigned long flags) +{ + int err; + struct vm_area_struct *first, *vma, *prev; + + first = find_vma(mm, start); + if (!first) + return ERR_PTR(-EFAULT); + prev = NULL; + for (vma = first; vma->vm_start < end; vma = vma->vm_next) { + if (!vma->vm_next && vma->vm_end < end) + return ERR_PTR(-EFAULT); + if (prev && prev->vm_end < vma->vm_start) + return ERR_PTR(-EFAULT); + if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { + err = verify_pages(vma->vm_start, vma->vm_end, nodes); + if (err) { + first = ERR_PTR(err); + break; + } + } + prev = vma; + } + return first; +} + +/* Apply policy to a single VMA */ +static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) +{ + int err = 0; + struct mempolicy *old = vma->vm_policy; + + PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? 
vma->vm_ops->set_policy : NULL); + + if (vma->vm_ops && vma->vm_ops->set_policy) + err = vma->vm_ops->set_policy(vma, new); + if (!err) { + mpol_get(new); + vma->vm_policy = new; + mpol_free(old); + } + return err; +} + +/* Step 2: apply policy to a range and do splits. */ +static int mbind_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct mempolicy *new) +{ + struct vm_area_struct *next; + int err; + + err = 0; + for (; vma->vm_start < end; vma = next) { + next = vma->vm_next; + if (vma->vm_start < start) + err = split_vma(vma->vm_mm, vma, start, 1); + if (!err && vma->vm_end > end) + err = split_vma(vma->vm_mm, vma, end, 0); + if (!err) + err = policy_vma(vma, new); + if (err) + break; + } + return err; +} + +/* Change policy for a memory range */ +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long *nmask, unsigned long maxnode, + unsigned flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + int err; + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) + return -EINVAL; + if (start & ~PAGE_MASK) + return -EINVAL; + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + + new = mpol_new(mode, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nodes, flags); + err = PTR_ERR(vma); + if (!IS_ERR(vma)) + err = mbind_range(vma, start, end, new); + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long *nmask, + unsigned long maxnode) +{ + int err; + struct mempolicy *new; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + + if (mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) + return err; + new = mpol_new(mode, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + mpol_free(current->mempolicy); + current->mempolicy = new; + if (new && new->policy == MPOL_INTERLEAVE) + current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + return 0; +} + +/* Fill a zone bitmap for a policy */ +static void get_zonemask(struct mempolicy *p, unsigned long *nodes) +{ + int i; + + bitmap_zero(nodes, MAX_NUMNODES); + switch (p->policy) { + case MPOL_BIND: + for (i = 0; p->v.zonelist->zones[i]; i++) + __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); + break; + case MPOL_DEFAULT: + break; + case MPOL_INTERLEAVE: + bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); + break; + case MPOL_PREFERRED: + /* or use current node instead of online map? 
*/ + if (p->v.preferred_node < 0) + bitmap_copy(nodes, node_online_map, MAX_NUMNODES); + else + __set_bit(p->v.preferred_node, nodes); + break; + default: + BUG(); + } +} + +static int lookup_node(struct mm_struct *mm, unsigned long addr) +{ + struct page *p; + int err; + + err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + if (err >= 0) { + err = page_zone(p)->zone_pgdat->node_id; + put_page(p); + } + return err; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long *user_mask, unsigned long maxnode, + unsigned long *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + + if (copy > sizeof(nodes)) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char*)user_mask + sizeof(nodes), + copy - sizeof(nodes))) + return -EFAULT; + copy = sizeof(nodes); + } + return copy_to_user(user_mask, nodes, copy) ? -EFAULT : 0; +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int *policy, + unsigned long *nmask, unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct mempolicy *pol = current->mempolicy; + + if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + if (nmask != NULL && maxnode < numnodes) + return -EINVAL; + if (flags & MPOL_F_ADDR) { + down_read(&mm->mmap_sem); + vma = find_vma_intersection(mm, addr, addr+1); + if (!vma) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else + pol = vma->vm_policy; + } else if (addr) + return -EINVAL; + + if (!pol) + pol = &default_policy; + + if (flags & MPOL_F_NODE) { + if (flags & MPOL_F_ADDR) { + err = lookup_node(mm, addr); + if (err < 0) + goto out; + pval = err; + } else if (pol == current->mempolicy && + pol->policy == MPOL_INTERLEAVE) { + pval = current->il_next; + } else { + err = -EINVAL; + goto out; + } + } else + pval = pol->policy; + + err = -EFAULT; + if (policy && put_user(pval, policy)) + goto out; + + err = 0; + if (nmask) { + DECLARE_BITMAP(nodes, MAX_NUMNODES); + get_zonemask(pol, nodes); + err = copy_nodes_to_user(nmask, maxnode, nodes); + } + + out: + if (vma) + up_read(¤t->mm->mmap_sem); + return err; +} + +#ifdef CONFIG_COMPAT +/* The other functions are compatible */ +asmlinkage long compat_get_mempolicy(int *policy, + unsigned *nmask, unsigned maxnode, + unsigned addr, unsigned flags) +{ + long err; + unsigned long *nm = NULL; + if (nmask) + nm = compat_alloc_user_space(ALIGN(maxnode-1, 64) / 8); + err = sys_get_mempolicy(policy, nm, maxnode, addr, flags); + if (!err && copy_in_user(nmask, nm, ALIGN(maxnode-1, 32)/8)) + err = -EFAULT; + return err; +} +#endif + +/* Return effective policy for a VMA */ +static struct mempolicy * +get_vma_policy(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = current->mempolicy; + + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else if (vma->vm_policy && + vma->vm_policy->policy != MPOL_DEFAULT) + pol = vma->vm_policy; + } + if (!pol) + pol = &default_policy; + return pol; +} + +/* Return a zonelist representing a mempolicy */ +static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy) +{ + int nd; + + switch (policy->policy) { + case MPOL_PREFERRED: + nd = policy->v.preferred_node; + if (nd < 0) + nd = numa_node_id(); + break; + case MPOL_BIND: + /* Lower zones don't get a 
policy applied */ + if (gfp >= policy_zone) + return policy->v.zonelist; + /*FALL THROUGH*/ + case MPOL_INTERLEAVE: /* should not happen */ + case MPOL_DEFAULT: + nd = numa_node_id(); + break; + default: + nd = 0; + BUG(); + } + return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK); +} + +/* Do dynamic interleaving for a process */ +static unsigned interleave_nodes(struct mempolicy *policy) +{ + unsigned nid, next; + struct task_struct *me = current; + + nid = me->il_next; + BUG_ON(nid >= MAX_NUMNODES); + next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); + if (next >= MAX_NUMNODES) + next = find_first_bit(policy->v.nodes, MAX_NUMNODES); + me->il_next = next; + return nid; +} + +/* Do static interleaving for a VMA with known offset. */ +static unsigned offset_il_node(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long off) +{ + unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); + unsigned target = (unsigned)off % nnodes; + int c; + int nid = -1; + + c = 0; + do { + nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); + c++; + } while (c <= target); + BUG_ON(nid >= MAX_NUMNODES); + BUG_ON(!test_bit(nid, pol->v.nodes)); + return nid; +} + +/* Allocate a page in interleaved policy. + Own path because it needs to do special accounting. */ +static struct page *alloc_page_interleave(unsigned gfp, unsigned nid) +{ + struct zonelist *zl; + struct page *page; + + BUG_ON(!test_bit(nid, node_online_map)); + zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); + page = __alloc_pages(gfp, 0, zl); + if (page && page_zone(page) == zl->zones[0]) { + zl->zones[0]->pageset[get_cpu()].interleave_hit++; + put_cpu(); + } + return page; +} + +/** + * alloc_page_vma - Allocate a page for a VMA. + * + * @gfp: + * %GFP_USER user allocation, + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * + * This function allocates a page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with the mmap_sem of the vma held. + */ +struct page * +alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + if (unlikely(pol->policy == MPOL_INTERLEAVE)) { + unsigned nid; + if (vma) { + unsigned long off; + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + off = vma->vm_pgoff; + off += (addr - vma->vm_start) >> PAGE_SHIFT; + nid = offset_il_node(pol, vma, off); + } else { + /* fall back to process interleaving */ + nid = interleave_nodes(pol); + } + return alloc_page_interleave(gfp, nid); + } + return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); +} + +/** + * alloc_pages_current - Allocate pages. + * + * @gfp: + * %GFP_USER user allocation, + * %GFP_KERNEL kernel allocation, + * %GFP_HIGHMEM highmem allocation, + * %GFP_FS don't call back into a file system. + * %GFP_ATOMIC don't sleep. + * @order: Power of two of allocation size in pages. 0 is a single page. + * + * Allocate a page from the kernel page pool.
When not in + * interrupt context, the current process' NUMA policy is applied. + * Returns NULL when no page can be allocated. + */ +struct page *alloc_pages_current(unsigned gfp, unsigned order) +{ + struct mempolicy *pol = current->mempolicy; + + if (!pol || in_interrupt()) + pol = &default_policy; + if (pol->policy == MPOL_INTERLEAVE && order == 0) + return alloc_page_interleave(gfp, interleave_nodes(pol)); + return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); +} +EXPORT_SYMBOL(alloc_pages_current); + +/* Slow path of a mempolicy copy */ +struct mempolicy *__mpol_copy(struct mempolicy *old) +{ + struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + + if (!new) + return ERR_PTR(-ENOMEM); + *new = *old; + atomic_set(&new->refcnt, 1); + if (new->policy == MPOL_BIND) { + int sz = ksize(old->v.zonelist); + new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + if (!new->v.zonelist) { + kmem_cache_free(policy_cache, new); + return ERR_PTR(-ENOMEM); + } + memcpy(new->v.zonelist, old->v.zonelist, sz); + } + return new; +} + +/* Slow path of a mempolicy comparison */ +int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (!a || !b) + return 0; + if (a->policy != b->policy) + return 0; + switch (a->policy) { + case MPOL_DEFAULT: + return 1; + case MPOL_INTERLEAVE: + return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); + case MPOL_PREFERRED: + return a->v.preferred_node == b->v.preferred_node; + case MPOL_BIND: { + int i; + for (i = 0; a->v.zonelist->zones[i]; i++) + if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i]) + return 0; + return b->v.zonelist->zones[i] == NULL; + } + default: + BUG(); + return 0; + } +} + +/* Slow path of an mpol destructor. */ +extern void __mpol_free(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + if (p->policy == MPOL_BIND) + kfree(p->v.zonelist); + p->policy = MPOL_DEFAULT; + kmem_cache_free(policy_cache, p); +} + +/* + * Hugetlb policy. Same as above, just works with node numbers instead of + * zonelists. + */ + +/* Find first node suitable for an allocation */ +int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + switch (pol->policy) { + case MPOL_DEFAULT: + return numa_node_id(); + case MPOL_BIND: + return pol->v.zonelist->zones[0]->zone_pgdat->node_id; + case MPOL_INTERLEAVE: + return interleave_nodes(pol); + case MPOL_PREFERRED: + return pol->v.preferred_node >= 0 ? + pol->v.preferred_node : numa_node_id(); + } + BUG(); + return 0; +} + +/* Find secondary valid nodes for an allocation */ +int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(vma, addr); + + switch (pol->policy) { + case MPOL_PREFERRED: + case MPOL_DEFAULT: + case MPOL_INTERLEAVE: + return 1; + case MPOL_BIND: { + struct zone **z; + for (z = pol->v.zonelist->zones; *z; z++) + if ((*z)->zone_pgdat->node_id == nid) + return 1; + return 0; + } + default: + BUG(); + return 0; + } +} + +/* + * Shared memory backing store policy support. + * + * Remember policies even when nobody has shared memory mapped. + * The policies are kept in a red-black tree linked from the inode. + * They are protected by the sp->sem semaphore, which should be held + * for any accesses to the tree.
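A filesystem that embeds one of these trees in its inode (as the hugetlbfs and shmem changes above do) exposes it through the vm_ops hooks added to mm.h; a sketch of such a ->get_policy, for a hypothetical filesystem whose MYFS_I() accessor works like HUGETLBFS_I() above:

/* Translate the faulting address to a page index and consult the
 * inode's policy tree; mpol_shared_policy_lookup() takes sp->sem
 * itself and returns a referenced policy, or NULL for default. */
static struct mempolicy *myfs_get_policy(struct vm_area_struct *vma,
					 unsigned long addr)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	unsigned long idx = vma->vm_pgoff +
			    ((addr - vma->vm_start) >> PAGE_SHIFT);

	return mpol_shared_policy_lookup(&MYFS_I(inode)->policy, idx);
}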
+ */
+
+/* Look up the first element intersecting start-end */
+/* Caller holds sp->sem */
+static struct sp_node *
+sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+{
+	struct rb_node *n = sp->root.rb_node;
+
+	while (n) {
+		struct sp_node *p = rb_entry(n, struct sp_node, nd);
+		if (start >= p->end) {
+			n = n->rb_right;
+		} else if (end <= p->start) {
+			n = n->rb_left;
+		} else {
+			break;
+		}
+	}
+	if (!n)
+		return NULL;
+	for (;;) {
+		struct sp_node *w = NULL;
+		struct rb_node *prev = rb_prev(n);
+		if (!prev)
+			break;
+		w = rb_entry(prev, struct sp_node, nd);
+		if (w->end <= start)
+			break;
+		n = prev;
+	}
+	return rb_entry(n, struct sp_node, nd);
+}
+
+/* Insert a new shared policy into the tree. */
+/* Caller holds sp->sem */
+static void sp_insert(struct shared_policy *sp, struct sp_node *new)
+{
+	struct rb_node **p = &sp->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct sp_node *nd;
+
+	while (*p) {
+		parent = *p;
+		nd = rb_entry(parent, struct sp_node, nd);
+		if (new->start < nd->start)
+			p = &(*p)->rb_left;
+		else if (new->end > nd->end)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+	rb_link_node(&new->nd, parent, p);
+	rb_insert_color(&new->nd, &sp->root);
+	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+		 new->policy ? new->policy->policy : 0);
+}
+
+/* Find the shared policy intersecting idx */
+struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+	struct mempolicy *pol = NULL;
+	struct sp_node *sn;
+
+	down(&sp->sem);
+	sn = sp_lookup(sp, idx, idx+1);
+	if (sn) {
+		mpol_get(sn->policy);
+		pol = sn->policy;
+	}
+	up(&sp->sem);
+	return pol;
+}
+
+static void sp_delete(struct shared_policy *sp, struct sp_node *n)
+{
+	PDprintk("deleting %lx-%lx\n", n->start, n->end);
+	rb_erase(&n->nd, &sp->root);
+	mpol_free(n->policy);
+	kmem_cache_free(sn_cache, n);
+}
+
+struct sp_node *
+sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
+{
+	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+
+	if (!n)
+		return NULL;
+	n->start = start;
+	n->end = end;
+	mpol_get(pol);
+	n->policy = pol;
+	return n;
+}
+
+/* Replace a policy range. */
+static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
+				 unsigned long end, struct sp_node *new)
+{
+	struct sp_node *n, *new2;
+
+	down(&sp->sem);
+	n = sp_lookup(sp, start, end);
+	/* Take care of old policies in the same range. */
+	while (n && n->start < end) {
+		struct rb_node *next = rb_next(&n->nd);
+		if (n->start >= start) {
+			if (n->end <= end)
+				sp_delete(sp, n);
+			else
+				n->start = end;
+		} else {
+			/* Old policy spanning the whole new range. */
+			if (n->end > end) {
+				new2 = sp_alloc(end, n->end, n->policy);
+				if (!new2) {
+					up(&sp->sem);
+					return -ENOMEM;
+				}
+				n->end = end;
+				sp_insert(sp, new2);
+			}
+			/* Old policy crossing the beginning, but not the end (easy) */
+			if (n->start < start && n->end > start)
+				n->end = start;
+		}
+		if (!next)
+			break;
+		n = rb_entry(next, struct sp_node, nd);
+	}
+	if (new)
+		sp_insert(sp, new);
+	up(&sp->sem);
+	return 0;
+}
+
+int mpol_set_shared_policy(struct shared_policy *info,
+			   struct vm_area_struct *vma, struct mempolicy *npol)
+{
+	int err;
+	struct sp_node *new = NULL;
+	unsigned long sz = vma_pages(vma);
+
+	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+		 vma->vm_pgoff,
+		 sz, npol ? npol->policy : -1,
+		 npol ? npol->v.nodes[0] : -1);
+
+	if (npol) {
+		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
+		if (!new)
+			return -ENOMEM;
+	}
+	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+	if (err && new)
+		kmem_cache_free(sn_cache, new);
+	return err;
+}
+
+/* Free a backing policy store on inode delete. */
+void mpol_free_shared_policy(struct shared_policy *p)
+{
+	struct sp_node *n;
+	struct rb_node *next;
+
+	down(&p->sem);
+	next = rb_first(&p->root);
+	while (next) {
+		n = rb_entry(next, struct sp_node, nd);
+		next = rb_next(&n->nd);
+		rb_erase(&n->nd, &p->root);
+		mpol_free(n->policy);
+		kmem_cache_free(sn_cache, n);
+	}
+	up(&p->sem);
+}
+
+static __init int numa_policy_init(void)
+{
+	policy_cache = kmem_cache_create("numa_policy",
+					 sizeof(struct mempolicy),
+					 0, SLAB_PANIC, NULL, NULL);
+
+	sn_cache = kmem_cache_create("shared_policy_node",
+				     sizeof(struct sp_node),
+				     0, SLAB_PANIC, NULL, NULL);
+	return 0;
+}
+module_init(numa_policy_init);
diff -purN -X /home/mbligh/.diff.exclude reference/mm/mmap.c current/mm/mmap.c
--- reference/mm/mmap.c	2004-05-01 10:23:54.000000000 -0700
+++ current/mm/mmap.c	2004-05-01 10:25:17.000000000 -0700
@@ -387,7 +387,8 @@ static struct vm_area_struct *vma_merge(
 			struct vm_area_struct *prev,
 			struct rb_node *rb_parent, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
-			struct file *file, unsigned long pgoff)
+			struct file *file, unsigned long pgoff,
+			struct mempolicy *policy)
 {
 	spinlock_t *lock = &mm->page_table_lock;
 	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
@@ -411,6 +412,7 @@ static struct vm_area_struct *vma_merge(
 	 * Can it merge with the predecessor?
 	 */
 	if (prev->vm_end == addr &&
+			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		struct vm_area_struct *next;
 		int need_up = 0;
@@ -428,6 +430,7 @@ static struct vm_area_struct *vma_merge(
 		 */
 		next = prev->vm_next;
 		if (next && prev->vm_end == next->vm_start &&
+				vma_mpol_equal(prev, next) &&
 				can_vma_merge_before(next, vm_flags, file,
 					pgoff, (end - addr) >> PAGE_SHIFT)) {
 			prev->vm_end = next->vm_end;
@@ -440,6 +443,7 @@ static struct vm_area_struct *vma_merge(
 				fput(file);

 			mm->map_count--;
+			mpol_free(vma_policy(next));
 			kmem_cache_free(vm_area_cachep, next);
 			return prev;
 		}
@@ -455,6 +459,8 @@ static struct vm_area_struct *vma_merge(
 		prev = prev->vm_next;
 	if (prev) {
  merge_next:
+		if (!mpol_equal(policy, vma_policy(prev)))
+			return NULL;
 		if (!can_vma_merge_before(prev, vm_flags, file,
 				pgoff, (end - addr) >> PAGE_SHIFT))
 			return NULL;
@@ -631,7 +637,7 @@ munmap_back:
 	/* Can we just expand an old anonymous mapping? */
 	if (!file && !(vm_flags & VM_SHARED) && rb_parent)
 		if (vma_merge(mm, prev, rb_parent, addr, addr + len,
-					vm_flags, NULL, 0))
+					vm_flags, NULL, pgoff, NULL))
 			goto out;

 	/*
@@ -654,6 +660,7 @@ munmap_back:
 	vma->vm_file = NULL;
 	vma->vm_private_data = NULL;
 	vma->vm_next = NULL;
+	mpol_set_vma_default(vma);
 	INIT_LIST_HEAD(&vma->shared);

 	if (file) {
@@ -693,7 +700,9 @@ munmap_back:
 	addr = vma->vm_start;

 	if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
-					addr + len, vma->vm_flags, file, pgoff)) {
+					vma->vm_end,
+					vma->vm_flags, file, pgoff,
+					vma_policy(vma))) {
 		vma_link(mm, vma, prev, rb_link, rb_parent);
 		if (correct_wcount)
 			atomic_inc(&inode->i_writecount);
@@ -703,6 +712,7 @@ munmap_back:
 			atomic_inc(&inode->i_writecount);
 			fput(file);
 		}
+		mpol_free(vma_policy(vma));
 		kmem_cache_free(vm_area_cachep, vma);
 	}
 out:
@@ -1118,6 +1128,7 @@ static void unmap_vma(struct mm_struct *

 	remove_shared_vm_struct(area);

+	mpol_free(vma_policy(area));
 	if (area->vm_ops && area->vm_ops->close)
 		area->vm_ops->close(area);
 	if (area->vm_file)
@@ -1200,6 +1211,7 @@ detach_vmas_to_be_unmapped(struct mm_str
 int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	      unsigned long addr, int new_below)
 {
+	struct mempolicy *pol;
 	struct vm_area_struct *new;
 	struct address_space *mapping = NULL;
@@ -1222,6 +1234,13 @@ int split_vma(struct mm_struct * mm, str
 		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
 	}

+	pol = mpol_copy(vma_policy(vma));
+	if (IS_ERR(pol)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return PTR_ERR(pol);
+	}
+	vma_set_policy(new, pol);
+
 	if (new->vm_file)
 		get_file(new->vm_file);
@@ -1391,7 +1410,7 @@ unsigned long do_brk(unsigned long addr,
 	/* Can we just expand an old anonymous mapping? */
 	if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len,
-					flags, NULL, 0))
+					flags, NULL, 0, NULL))
 		goto out;
@@ -1412,6 +1431,7 @@ unsigned long do_brk(unsigned long addr,
 	vma->vm_pgoff = 0;
 	vma->vm_file = NULL;
 	vma->vm_private_data = NULL;
+	mpol_set_vma_default(vma);
 	INIT_LIST_HEAD(&vma->shared);

 	vma_link(mm, vma, prev, rb_link, rb_parent);
@@ -1472,6 +1492,7 @@ void exit_mmap(struct mm_struct *mm)
 		}
 		if (vma->vm_file)
 			fput(vma->vm_file);
+		mpol_free(vma_policy(vma));
 		kmem_cache_free(vm_area_cachep, vma);
 		vma = next;
 	}
@@ -1504,10 +1525,11 @@ struct vm_area_struct *copy_vma(struct v
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma, *prev;
 	struct rb_node **rb_link, *rb_parent;
+	struct mempolicy *pol;

 	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 	new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
-			vma->vm_flags, vma->vm_file, pgoff);
+			vma->vm_flags, vma->vm_file, pgoff, vma_policy(vma));
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
@@ -1519,6 +1541,12 @@ struct vm_area_struct *copy_vma(struct v
 		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
+			pol = mpol_copy(vma_policy(vma));
+			if (IS_ERR(pol)) {
+				kmem_cache_free(vm_area_cachep, new_vma);
+				return NULL;
+			}
+			vma_set_policy(new_vma, pol);
 			INIT_LIST_HEAD(&new_vma->shared);
 			new_vma->vm_start = addr;
 			new_vma->vm_end = addr + len;
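
[Editor's note: the mpol_equal()/vma_mpol_equal() checks added above are what
keep differently-policied regions from being merged. A hedged illustration,
not part of the patch: mbind() on the middle page of an anonymous mapping
goes through split_vma(), and the policy-aware merge paths then refuse to
fuse the pieces back together. Syscall slot and constant are assumptions as
in the earlier sketch.]

/*
 * Hypothetical demo: force a VMA split with mbind().
 * 274 = i386 sys_mbind slot (assumed); MPOL_PREFERRED = 1 (assumed).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define NR_mbind	274	/* assumed i386 slot */
#define MPOL_PREFERRED	1	/* assumed constant */

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	unsigned long nodemask = 0x1;	/* prefer node 0 */
	char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* mbind(start, len, mode, nodemask, maxnode, flags) */
	syscall(NR_mbind, p + pg, (unsigned long)pg, MPOL_PREFERRED,
		&nodemask, 8 * sizeof(nodemask), 0);
	/* The single anonymous VMA should now be three entries. */
	system("wc -l /proc/self/maps");
	return 0;
}
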
diff -purN -X /home/mbligh/.diff.exclude reference/mm/mprotect.c current/mm/mprotect.c
--- reference/mm/mprotect.c	2004-04-30 11:23:57.000000000 -0700
+++ current/mm/mprotect.c	2004-05-01 10:25:17.000000000 -0700
@@ -125,6 +125,8 @@ mprotect_attempt_merge(struct vm_area_st
 		return 0;
 	if (vma->vm_file || (vma->vm_flags & VM_SHARED))
 		return 0;
+	if (!vma_mpol_equal(vma, prev))
+		return 0;

 	/*
 	 * If the whole area changes to the protection of the previous one
@@ -136,6 +138,7 @@ mprotect_attempt_merge(struct vm_area_st
 	__vma_unlink(mm, vma, prev);
 	spin_unlock(&mm->page_table_lock);

+	mpol_free(vma_policy(vma));
 	kmem_cache_free(vm_area_cachep, vma);
 	mm->map_count--;
 	return 1;
@@ -318,12 +321,14 @@ sys_mprotect(unsigned long start, size_t

 	if (next && prev->vm_end == next->vm_start &&
 			can_vma_merge(next, prev->vm_flags) &&
+			vma_mpol_equal(prev, next) &&
 			!prev->vm_file && !(prev->vm_flags & VM_SHARED)) {
 		spin_lock(&prev->vm_mm->page_table_lock);
 		prev->vm_end = next->vm_end;
 		__vma_unlink(prev->vm_mm, next, prev);
 		spin_unlock(&prev->vm_mm->page_table_lock);
+		mpol_free(vma_policy(next));
 		kmem_cache_free(vm_area_cachep, next);
 		prev->vm_mm->map_count--;
 	}
diff -purN -X /home/mbligh/.diff.exclude reference/mm/page_alloc.c current/mm/page_alloc.c
--- reference/mm/page_alloc.c	2004-04-30 11:23:57.000000000 -0700
+++ current/mm/page_alloc.c	2004-05-01 10:25:17.000000000 -0700
@@ -460,6 +460,32 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */

+static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+{
+#ifdef CONFIG_NUMA
+	unsigned long flags;
+	int cpu;
+	pg_data_t *pg = z->zone_pgdat;
+	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
+	struct per_cpu_pageset *p;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	p = &z->pageset[cpu];
+	if (pg == orig) {
+		p->numa_hit++;
+	} else {
+		p->numa_miss++;
+		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+	}
+	if (pg == NODE_DATA(numa_node_id()))
+		p->local_node++;
+	else
+		p->other_node++;
+	local_irq_restore(flags);
+#endif
+}
+
 /*
  * Free a 0-order page
  */
@@ -593,8 +619,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
+			if (page) {
+				zone_statistics(zonelist, z);
 				goto got_pg;
+			}
 		}
 	}
@@ -616,8 +644,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
+			if (page) {
+				zone_statistics(zonelist, z);
 				goto got_pg;
+			}
 		}
 	}
@@ -630,8 +660,10 @@ rebalance:
 			struct zone *z = zones[i];

 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
+			if (page) {
+				zone_statistics(zonelist, z);
 				goto got_pg;
+			}
 		}
 		goto nopage;
 	}
@@ -658,8 +690,10 @@ rebalance:
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
+			if (page) {
+				zone_statistics(zonelist, z);
 				goto got_pg;
+			}
 		}
 	}
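
[Editor's note: the per-CPU counters that zone_statistics() above accumulates
are exported through sysfs by the drivers/base/node.c change in this series.
A minimal sketch of a userspace reader, not part of the patch; node0 is just
an example node.]

/* Dump numa_hit/numa_miss/numa_foreign/local_node/other_node for node 0. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/node/node0/numastat";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "numa_hit 123456" */
	fclose(f);
	return 0;
}
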
diff -purN -X /home/mbligh/.diff.exclude reference/mm/shmem.c current/mm/shmem.c
--- reference/mm/shmem.c	2004-05-01 10:21:04.000000000 -0700
+++ current/mm/shmem.c	2004-05-01 10:25:17.000000000 -0700
@@ -8,6 +8,7 @@
  *		 2002 Red Hat Inc.
  * Copyright (C) 2002-2003 Hugh Dickins.
  * Copyright (C) 2002-2003 VERITAS Software Corporation.
+ * Copyright (C) 2004 Andi Kleen, SuSE Labs
  *
  * This file is released under the GPL.
  */
@@ -37,8 +38,10 @@
 #include
 #include
 #include
+#include
 #include
 #include
+#include <linux/mempolicy.h>

 /* This magic number is used in glibc for posix shared memory */
 #define TMPFS_MAGIC	0x01021994
@@ -783,6 +786,74 @@ redirty:
 	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
 }

+#ifdef CONFIG_NUMA
+static struct page *shmem_swapin_async(struct shared_policy *p,
+				       swp_entry_t entry, unsigned long idx)
+{
+	struct page *page;
+	struct vm_area_struct pvma;
+
+	/* Create a pseudo vma that just contains the policy */
+	memset(&pvma, 0, sizeof(struct vm_area_struct));
+	pvma.vm_end = PAGE_SIZE;
+	pvma.vm_pgoff = idx;
+	pvma.vm_policy = mpol_shared_policy_lookup(p, idx);
+	page = read_swap_cache_async(entry, &pvma, 0);
+	mpol_free(pvma.vm_policy);
+	return page;
+}
+
+struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
+			  unsigned long idx)
+{
+	struct shared_policy *p = &info->policy;
+	int i, num;
+	struct page *page;
+	unsigned long offset;
+
+	num = valid_swaphandles(entry, &offset);
+	for (i = 0; i < num; offset++, i++) {
+		page = shmem_swapin_async(p,
+				swp_entry(swp_type(entry), offset), idx);
+		if (!page)
+			break;
+		page_cache_release(page);
+	}
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+	return shmem_swapin_async(p, entry, idx);
+}
+
+static struct page *
+shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
+		 unsigned long idx)
+{
+	struct vm_area_struct pvma;
+	struct page *page;
+
+	memset(&pvma, 0, sizeof(struct vm_area_struct));
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
+	pvma.vm_pgoff = idx;
+	pvma.vm_end = PAGE_SIZE;
+	page = alloc_page_vma(gfp, &pvma, 0);
+	mpol_free(pvma.vm_policy);
+	return page;
+}
+#else
+static inline struct page *
+shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry,
+	     unsigned long idx)
+{
+	swapin_readahead(entry, 0, NULL);
+	return read_swap_cache_async(entry, NULL, 0);
+}
+
+static inline struct page *
+shmem_alloc_page(unsigned long gfp, struct shmem_inode_info *info,
+		 unsigned long idx)
+{
+	return alloc_page(gfp);
+}
+#endif
+
 /*
  * shmem_getpage - either get the page from swap or allocate a new one
  *
@@ -790,7 +861,8 @@ redirty:
  * vm. If we swap it in we mark it dirty since we also free the swap
  * entry since a page cannot live in both the swap and page cache
  */
-static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp, int *type)
+static int shmem_getpage(struct inode *inode, unsigned long idx,
+			 struct page **pagep, enum sgp_type sgp, int *type)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -840,8 +912,7 @@ repeat:
 			if (majmin == VM_FAULT_MINOR && type)
 				inc_page_state(pgmajfault);
 			majmin = VM_FAULT_MAJOR;
-			swapin_readahead(swap);
-			swappage = read_swap_cache_async(swap);
+			swappage = shmem_swapin(info, swap, idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
 				entry = shmem_swp_alloc(info, idx, sgp);
@@ -946,7 +1017,9 @@ repeat:

 		if (!filepage) {
 			spin_unlock(&info->lock);
-			filepage = page_cache_alloc(mapping);
+			filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
+						    info,
+						    idx);
 			if (!filepage) {
 				shmem_unacct_blocks(info->flags, 1);
 				shmem_free_block(inode);
@@ -1069,6 +1142,24 @@ static int shmem_populate(struct vm_area
 	return 0;
 }

+#ifdef CONFIG_NUMA
+int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct inode *i = vma->vm_file->f_dentry->d_inode;
+	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
+}
+
+struct mempolicy *
+shmem_get_policy(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct inode *i = vma->vm_file->f_dentry->d_inode;
+	unsigned long idx;
+
+	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
+}
+#endif
+
 void shmem_lock(struct file *file, int lock)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -1117,6 +1208,7 @@ shmem_get_inode(struct super_block *sb,
 	info = SHMEM_I(inode);
 	memset(info, 0, (char *)inode - (char *)info);
 	spin_lock_init(&info->lock);
+	mpol_shared_policy_init(&info->policy);
 	switch (mode & S_IFMT) {
 	default:
 		init_special_inode(inode, mode, dev);
@@ -1792,6 +1884,7 @@ static struct inode *shmem_alloc_inode(s

 static void shmem_destroy_inode(struct inode *inode)
 {
+	mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }

@@ -1876,6 +1969,10 @@ static struct super_operations shmem_ops
 static struct vm_operations_struct shmem_vm_ops = {
 	.nopage		= shmem_nopage,
 	.populate	= shmem_populate,
+#ifdef CONFIG_NUMA
+	.set_policy	= shmem_set_policy,
+	.get_policy	= shmem_get_policy,
+#endif
 };

 static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
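
[Editor's note: a hedged sketch of how the shmem hooks above are reached from
userspace; not part of the patch. With shmem_set_policy() in place, mbind()
on a MAP_SHARED tmpfs mapping stores the policy in the inode's shared_policy
tree, so later faults allocate through it even if the file is unmapped and
mapped again. Syscall slot, constant, and the /dev/shm path are assumptions.]

/*
 * Hypothetical demo: attach an interleave policy to a tmpfs object.
 * 274 = i386 sys_mbind slot (assumed); MPOL_INTERLEAVE = 3 (assumed).
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#define NR_mbind	274	/* assumed i386 slot */
#define MPOL_INTERLEAVE	3	/* assumed constant */

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	unsigned long nodemask = 0x3;	/* nodes 0 and 1 */
	int fd = open("/dev/shm/mpol-demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, 4 * pg) < 0)
		return 1;
	p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/* Routed via vm_ops->set_policy -> mpol_set_shared_policy(). */
	syscall(NR_mbind, p, 4UL * pg, MPOL_INTERLEAVE,
		&nodemask, 8 * sizeof(nodemask), 0);
	p[0] = 1;	/* first fault allocates via shmem_alloc_page() */
	return 0;
}
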
diff -purN -X /home/mbligh/.diff.exclude reference/mm/swap_state.c current/mm/swap_state.c
--- reference/mm/swap_state.c	2004-04-30 11:23:57.000000000 -0700
+++ current/mm/swap_state.c	2004-05-01 10:25:17.000000000 -0700
@@ -325,7 +325,8 @@ struct page * lookup_swap_cache(swp_entr
  * A failure return means that either the page allocation failed or that
  * the swap entry is no longer in use.
  */
-struct page * read_swap_cache_async(swp_entry_t entry)
+struct page *read_swap_cache_async(swp_entry_t entry,
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *found_page, *new_page = NULL;
 	int err;
@@ -349,7 +350,7 @@ struct page * read_swap_cache_async(swp_
 		 * Get a new page to read into from swap.
 		 */
 		if (!new_page) {
-			new_page = alloc_page(GFP_HIGHUSER);
+			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
 			if (!new_page)
 				break;		/* Out of memory */
 		}
diff -purN -X /home/mbligh/.diff.exclude reference/mm/swapfile.c current/mm/swapfile.c
--- reference/mm/swapfile.c	2004-05-01 10:21:04.000000000 -0700
+++ current/mm/swapfile.c	2004-05-01 10:25:17.000000000 -0700
@@ -685,7 +685,7 @@ static int try_to_unuse(unsigned int typ
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
-		page = read_swap_cache_async(entry);
+		page = read_swap_cache_async(entry, NULL, 0);
 		if (!page) {
 			/*
 			 * Either swap_duplicate() failed because entry