diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/arch/i386/kernel/entry.S 851-mbind_part2/arch/i386/kernel/entry.S
--- 850-mbind_part1/arch/i386/kernel/entry.S	Wed Aug 13 20:29:41 2003
+++ 851-mbind_part2/arch/i386/kernel/entry.S	Wed Aug 13 20:51:50 2003
@@ -858,7 +858,7 @@ ENTRY(sys_call_table)
 	.long sys_getdents64	/* 220 */
 	.long sys_fcntl64
 	.long sys_ni_syscall	/* reserved for TUX */
-	.long sys_ni_syscall
+	.long sys_mbind
 	.long sys_gettid
 	.long sys_readahead	/* 225 */
 	.long sys_setxattr
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/fs/inode.c 851-mbind_part2/fs/inode.c
--- 850-mbind_part1/fs/inode.c	Wed Aug 13 20:24:28 2003
+++ 851-mbind_part2/fs/inode.c	Wed Aug 13 20:51:50 2003
@@ -145,6 +145,9 @@ static struct inode *alloc_inode(struct
 		mapping->dirtied_when = 0;
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
+#ifdef CONFIG_NUMA
+		mapping->binding = NULL;
+#endif
 		if (sb->s_bdev)
 			mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		memset(&inode->u, 0, sizeof(inode->u));
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/include/asm-i386/unistd.h 851-mbind_part2/include/asm-i386/unistd.h
--- 850-mbind_part1/include/asm-i386/unistd.h	Tue Aug 5 20:01:43 2003
+++ 851-mbind_part2/include/asm-i386/unistd.h	Wed Aug 13 20:51:50 2003
@@ -228,7 +228,7 @@
 #define __NR_madvise1		219	/* delete when C lib stub is removed */
 #define __NR_getdents64		220
 #define __NR_fcntl64		221
-/* 223 is unused */
+#define __NR_mbind		223
 #define __NR_gettid		224
 #define __NR_readahead		225
 #define __NR_setxattr		226
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/include/linux/fs.h 851-mbind_part2/include/linux/fs.h
--- 850-mbind_part1/include/linux/fs.h	Wed Aug 13 20:24:32 2003
+++ 851-mbind_part2/include/linux/fs.h	Wed Aug 13 20:51:50 2003
@@ -332,6 +332,9 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+#ifdef CONFIG_NUMA
+	struct binding		*binding;	/* for memory bindings */
+#endif
 };
 
 struct block_device {
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/include/linux/mmzone.h 851-mbind_part2/include/linux/mmzone.h
--- 850-mbind_part1/include/linux/mmzone.h	Wed Aug 13 20:51:47 2003
+++ 851-mbind_part2/include/linux/mmzone.h	Wed Aug 13 20:51:50 2003
@@ -370,6 +370,11 @@ static inline struct zonelist *get_node_
 
 #define get_zonelist(gfp_mask)	get_node_zonelist(numa_node_id(), gfp_mask)
 
+/* Structure to keep track of memory segment (VMA) bindings */
+struct binding {
+	struct zonelist zonelist;
+};
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/include/linux/pagemap.h 851-mbind_part2/include/linux/pagemap.h
--- 850-mbind_part1/include/linux/pagemap.h	Wed Aug 13 20:51:47 2003
+++ 851-mbind_part2/include/linux/pagemap.h	Wed Aug 13 20:51:50 2003
@@ -27,10 +27,28 @@
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+#ifndef CONFIG_NUMA
+
 static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask)
 {
 	return alloc_pages(gfp_mask, 0);
 }
+
+#else /* CONFIG_NUMA */
+
+static inline struct page *__page_cache_alloc(struct address_space *x, int gfp_mask)
+{
+	struct zonelist *zonelist;
+
+	if (!x->binding)
+		zonelist = get_zonelist(gfp_mask);
+	else
+		zonelist = &x->binding->zonelist;
+
+	return __alloc_pages(gfp_mask, 0, zonelist);
+}
+
+#endif /* !CONFIG_NUMA */
 
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/kernel/sys.c 851-mbind_part2/kernel/sys.c
--- 850-mbind_part1/kernel/sys.c	Tue Aug 5 20:01:56 2003
+++ 851-mbind_part2/kernel/sys.c	Wed Aug 13 20:51:50 2003
@@ -235,6 +235,7 @@ cond_syscall(sys_epoll_ctl)
 cond_syscall(sys_epoll_wait)
 cond_syscall(sys_pciconfig_read)
 cond_syscall(sys_pciconfig_write)
+cond_syscall(sys_mbind)
 
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/mm/Makefile 851-mbind_part2/mm/Makefile
--- 850-mbind_part1/mm/Makefile	Thu Feb 13 11:08:15 2003
+++ 851-mbind_part2/mm/Makefile	Wed Aug 13 20:51:50 2003
@@ -7,8 +7,10 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   shmem.o vmalloc.o
 
-obj-y	:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y	:= bootmem.o fadvise.o filemap.o mempool.o oom_kill.o \
 	   page_alloc.o page-writeback.o pdflush.o readahead.o \
 	   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+
+obj-$(CONFIG_NUMA) += mbind.o
diff -urpN -X /home/fletch/.diff.exclude 850-mbind_part1/mm/mbind.c 851-mbind_part2/mm/mbind.c
--- 850-mbind_part1/mm/mbind.c	Wed Dec 31 16:00:00 1969
+++ 851-mbind_part2/mm/mbind.c	Wed Aug 13 20:51:50 2003
@@ -0,0 +1,147 @@
+/*
+ * mm/mbind.c
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2003, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to
+ */
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/topology.h>
+#include <asm/uaccess.h>
+
+/* Translate a cpumask to a nodemask */
+static inline void cpumask_to_nodemask(unsigned long * cpumask, unsigned long * nodemask)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		if (test_bit(i, cpumask))
+			set_bit(cpu_to_node(i), nodemask);
+}
+
+/*
+ * Adds the zones belonging to @pgdat to @zonelist.  Returns the next
+ * index in @zonelist.
+ */
+static inline int add_node(pg_data_t *pgdat, struct zonelist *zonelist, int zone_num)
+{
+	int i;
+	struct zone *zone;
+
+	for (i = MAX_NR_ZONES-1; i >= 0; i--) {
+		zone = pgdat->node_zones + i;
+		if (zone->present_pages)
+			zonelist->zones[zone_num++] = zone;
+	}
+	return zone_num;
+}
+
+/* Builds a binding for a region of memory, based on a bitmask of nodes. */
+static inline int build_binding(unsigned long * nodemask, struct binding *binding)
+{
+	int node, zone_num;
+
+	memset(binding, 0, sizeof(struct binding));
+
+	/* Build binding zonelist */
+	for (node = 0, zone_num = 0; node < MAX_NUMNODES; node++)
+		if (test_bit(node, nodemask) && node_online(node))
+			zone_num = add_node(NODE_DATA(node),
+					&binding->zonelist, zone_num);
+	binding->zonelist.zones[zone_num] = NULL;
+
+	if (zone_num == 0)
+		/* No zones were added to the zonelist.  Let the caller know. */
+		return -EINVAL;
+
+	return 0;
+}
+
+
+/*
+ * mbind - Bind a range of a process' VM space to a set of memory blocks according to
+ *	a predefined policy.
+ * @start: beginning address of memory region to bind
+ * @len: length of memory region to bind
+ * @mask_ptr: pointer to bitmask of cpus
+ * @mask_len: length of the bitmask
+ * @policy: flag specifying the policy to use for the segment
+ */
+asmlinkage unsigned long sys_mbind(unsigned long start, unsigned long len,
+	unsigned long *mask_ptr, unsigned int mask_len, unsigned long policy)
+{
+	DECLARE_BITMAP(cpu_mask, NR_CPUS);
+	DECLARE_BITMAP(node_mask, MAX_NUMNODES);
+	struct vm_area_struct *vma = NULL;
+	struct address_space *mapping;
+	int copy_len, error = 0;
+
+	/* Deal with getting cpu_mask from userspace & translating to node_mask */
+	CLEAR_BITMAP(cpu_mask, NR_CPUS);
+	CLEAR_BITMAP(node_mask, MAX_NUMNODES);
+	copy_len = min(mask_len, (unsigned int)NR_CPUS);
+	if (copy_from_user(cpu_mask, mask_ptr, (copy_len+7)/8)) {
+		error = -EFAULT;
+		goto out;
+	}
+	cpumask_to_nodemask(cpu_mask, node_mask);
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, start);
+	up_read(&current->mm->mmap_sem);
+	/* This is an ugly, gross hack.  This is purely because I've hurt my
+	 * brain trying to come up with a brilliant way of implementing this
+	 * for VMA's in general.  Shared Memory VMA's lend themselves to binding
+	 * both because of how they're implemented, and their actual uses.
+	 * If anyone has a great place to squirrel-away some data about the
+	 * requested binding, and a way to easily force the allocator to respect
+	 * these bindings, then send a patch, or let me know.  Otherwise, this
+	 * will have to wait for a stroke of insight.
+	 */
+	if (!(vma && vma->vm_file && vma->vm_ops &&
+		vma->vm_ops->nopage == shmem_nopage)) {
+		/* This isn't a shm segment.  For now, we bail. */
+		error = -EINVAL;
+		goto out;
+	}
+
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	if (mapping->binding) {
+		kfree(mapping->binding);
+		mapping->binding = NULL;
+	}
+	mapping->binding = kmalloc(sizeof(struct binding), GFP_KERNEL);
+	if (!mapping->binding) {
+		error = -ENOMEM;
+		goto out;
+	}
+	error = build_binding(node_mask, mapping->binding);
+	if (error) {
+		kfree(mapping->binding);
+		mapping->binding = NULL;
+	}
+
+out:
+	return error;
+}
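
For reference, a minimal userspace sketch of how the new call might be exercised (this example is not part of the patch). It assumes the i386 syscall number 223 wired up above and a SysV shm mapping, which is the only kind this version of sys_mbind() accepts (it checks for a shmem_nopage-backed VMA). There is no C library wrapper, so syscall(2) is used directly. Per the kernel code, the mask argument is a CPU bitmask whose length is given in bits and is translated to a node mask in the kernel; the policy argument is accepted but not yet interpreted, so 0 is passed. Whether subsequently faulted pages actually come from the bound nodes depends on the shmem allocation path going through the __page_cache_alloc() hook added in part 1.

/*
 * Hypothetical userspace example (not part of the patch): bind a SysV shm
 * segment to the home nodes of CPUs 0 and 1 using the new syscall.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <unistd.h>

#define __NR_mbind	223	/* i386 slot added by this patch; no glibc wrapper */

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	unsigned long cpu_mask = 0x3;	/* CPUs 0 and 1 -> their nodes */
	int shmid;
	char *addr;

	/* Only shm-backed VMAs are accepted by this version of sys_mbind(). */
	shmid = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	if (shmid < 0) {
		perror("shmget");
		return 1;
	}
	addr = shmat(shmid, NULL, 0);
	if (addr == (char *)-1) {
		perror("shmat");
		return 1;
	}

	/* mask_len is in bits; policy is not yet interpreted, so pass 0. */
	if (syscall(__NR_mbind, (unsigned long)addr, (unsigned long)len,
		    &cpu_mask, 8 * sizeof(cpu_mask), 0UL) != 0)
		perror("mbind");

	/* Pages faulted in from here on should come from the bound nodes. */
	memset(addr, 0, len);

	shmdt(addr);
	shmctl(shmid, IPC_RMID, NULL);
	return 0;
}

Note that the binding hangs off the shared segment's address_space rather than the calling process's VMA, so it applies to every mapping of that segment and stays in effect until a later mbind() call replaces it.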