diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/arch/i386/Kconfig 500-user_text_replication/arch/i386/Kconfig
--- 490-amd_sysrq_t/arch/i386/Kconfig	2003-12-11 17:29:34.000000000 -0800
+++ 500-user_text_replication/arch/i386/Kconfig	2003-12-12 16:42:18.000000000 -0800
@@ -792,6 +792,17 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
 comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
 	depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
 
+config MAPPING_REPLICATE
+	bool " NUMA user text replication"
+	depends on NUMA
+	default y
+	help
+	  Selecting this option will allow the NUMA code to make node-local copies
+	  of some kinds of read-only files, including executables and shared
+	  libraries.
+
+	  If unsure, say "n".
+
 config DISCONTIGMEM
 	bool
 	depends on NUMA
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/fs/inode.c 500-user_text_replication/fs/inode.c
--- 490-amd_sysrq_t/fs/inode.c	2003-12-11 17:28:01.000000000 -0800
+++ 500-user_text_replication/fs/inode.c	2003-12-12 16:42:18.000000000 -0800
@@ -196,6 +196,9 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
 	spin_lock_init(&inode->i_lock);
 	i_size_ordered_init(inode);
+#ifdef CONFIG_MAPPING_REPLICATE
+	atomic_set(&inode->i_data.replicate, 0);
+#endif
 }
 
 EXPORT_SYMBOL(inode_init_once);
@@ -993,6 +996,7 @@ void generic_delete_inode(struct inode *
 
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
+	clear_replication(inode);
 
 	security_inode_delete(inode);
 
@@ -1039,6 +1043,8 @@ static void generic_forget_inode(struct
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
+
+	clear_replication(inode);
 	clear_inode(inode);
 	destroy_inode(inode);
 }
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/fs/namei.c 500-user_text_replication/fs/namei.c
--- 490-amd_sysrq_t/fs/namei.c	2003-10-14 15:50:29.000000000 -0700
+++ 500-user_text_replication/fs/namei.c	2003-12-12 16:42:18.000000000 -0800
@@ -241,29 +241,76 @@ int permission(struct inode * inode,int
  * who will try to move it in struct inode - just leave it here.
  */
 static spinlock_t arbitration_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * if the inability to get_write_access() is because
+ * of replication going on, collapse the replication
+ * and try again
+ */
+static int inode_try_replication_disable(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	if (unlikely(mapping_replicate(inode->i_mapping))) {
+		spin_unlock(&arbitration_lock);
+
+		/* the collapsing is like truncating, and is protected
+		 * by i_sem */
+		down(&inode->i_sem);
+		collapse_replication(mapping, NULL);
+		spin_lock(&arbitration_lock);
+		up(&inode->i_sem);
+
+		return 1;
+	}
+	return 0;
+}
+
 int get_write_access(struct inode * inode)
 {
 	spin_lock(&arbitration_lock);
+retry:
 	if (atomic_read(&inode->i_writecount) < 0) {
+		/* this can drop and reacquire the arbitration_lock */
+		if (inode_try_replication_disable(inode))
+			goto retry;
 		spin_unlock(&arbitration_lock);
 		return -ETXTBSY;
 	}
 	atomic_inc(&inode->i_writecount);
+	BUG_ON(mapping_replicate(inode->i_mapping));
 	spin_unlock(&arbitration_lock);
 
 	return 0;
 }
 
-int deny_write_access(struct file * file)
+int __deny_write_access(struct file * file, int set_replicate)
 {
+	struct inode *inode = file->f_dentry->d_inode;
+
 	spin_lock(&arbitration_lock);
-	if (atomic_read(&file->f_dentry->d_inode->i_writecount) > 0) {
+	if (atomic_read(&inode->i_writecount) > 0) {
 		spin_unlock(&arbitration_lock);
 		return -ETXTBSY;
 	}
-	atomic_dec(&file->f_dentry->d_inode->i_writecount);
+	atomic_dec(&inode->i_writecount);
+
+	/*
+	 * this is done under the arbitration_lock to prevent any
+	 * races where a potential writer might not see that
+	 * writing is denied because of replication, and not just
+	 * a normal write deny.
+	 */
+#ifdef CONFIG_MAPPING_REPLICATE
+	if (set_replicate && !mapping_replicate(inode->i_mapping))
+		atomic_inc(&inode->i_data.replicate);
+#endif
+
 	spin_unlock(&arbitration_lock);
 
 	return 0;
 }
 
+int deny_write_access(struct file * file)
+{
+	return __deny_write_access(file, 0);
+}
+
 void path_release(struct nameidata *nd)
 {
 	dput(nd->dentry);
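
The arbitration above is the core invariant of the patch: a mapping may only be replicated while writers are denied (i_writecount < 0), and a writer that runs into a replicated mapping collapses the replicas and retries rather than failing. A minimal sketch of how a setup path is expected to drive this -- hypothetical code, not part of the patch (the real entry point is file_try_replicate() in mm/filemap.c further down):

	/* hypothetical illustration only */
	static int start_replicating(struct file *file)
	{
		/* deny writers and raise i_data.replicate in one step */
		if (__deny_write_access(file, 1))
			return 0;	/* -ETXTBSY: an active writer exists, skip it */

		/*
		 * From here on mapping_replicate() is true; a later
		 * get_write_access() will collapse the replicas under
		 * i_sem and then retry, so writers are never locked
		 * out for good.
		 */
		return 1;
	}
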
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/asm-i386/mmzone.h 500-user_text_replication/include/asm-i386/mmzone.h
--- 490-amd_sysrq_t/include/asm-i386/mmzone.h	2003-12-11 17:16:48.000000000 -0800
+++ 500-user_text_replication/include/asm-i386/mmzone.h	2003-12-12 16:42:18.000000000 -0800
@@ -149,5 +149,7 @@ static inline void get_memcfg_numa(void)
 	get_memcfg_numa_flat();
 }
 
+#define page_is_local(page)	(page_to_nid(page) == numa_node_id())
+
 #endif /* CONFIG_DISCONTIGMEM */
 #endif /* _ASM_MMZONE_H_ */
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/fs.h 500-user_text_replication/include/linux/fs.h
--- 490-amd_sysrq_t/include/linux/fs.h	2003-12-11 17:28:01.000000000 -0800
+++ 500-user_text_replication/include/linux/fs.h	2003-12-12 16:42:18.000000000 -0800
@@ -339,8 +339,22 @@ struct address_space {
 #ifdef CONFIG_NUMA
 	struct binding	*binding;	/* for memory bindings */
 #endif
+#ifdef CONFIG_MAPPING_REPLICATE
+	atomic_t	replicate;
+#endif
 };
 
+#ifdef CONFIG_MAPPING_REPLICATE
+  #define mapping_replicate(mapping)	(atomic_read(&(mapping)->replicate) > 0)
+  #define clear_replication(inode)	do {				\
+	if (atomic_read(&inode->i_data.replicate))			\
+		atomic_dec(&inode->i_data.replicate);			\
+  } while (0)
+#else
+  #define mapping_replicate(mapping)	(0)
+  #define clear_replication(inode)	do {} while(0)
+#endif
+
 struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
 	struct inode *		bd_inode;	/* will die */
@@ -1202,9 +1216,11 @@ static inline void invalidate_remote_ino
 }
 
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
+extern int file_try_replicate(struct file *file);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
+extern void collapse_replication(struct address_space *mapping, struct file *file);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
@@ -1218,6 +1234,7 @@ extern int permission(struct inode *, in
 extern int vfs_permission(struct inode *, int);
 extern int get_write_access(struct inode *);
 extern int deny_write_access(struct file *);
+extern int __deny_write_access(struct file *, int);
 static inline void put_write_access(struct inode * inode)
 {
 	atomic_dec(&inode->i_writecount);
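
Note the design of the #else branch: with CONFIG_MAPPING_REPLICATE disabled, mapping_replicate() is the constant 0 and clear_replication() is an empty statement, so call sites such as generic_delete_inode() and generic_forget_inode() above stay unconditional and compile away. A hypothetical call site, just to illustrate the pattern (not patch code):

	static void teardown_mapping(struct inode *inode)
	{
		/* no #ifdef needed here: with replication compiled out
		 * this whole call is a no-op */
		clear_replication(inode);
	}
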
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/pagemap.h 500-user_text_replication/include/linux/pagemap.h
--- 490-amd_sysrq_t/include/linux/pagemap.h	2003-12-11 17:28:01.000000000 -0800
+++ 500-user_text_replication/include/linux/pagemap.h	2003-12-12 16:42:18.000000000 -0800
@@ -96,6 +96,9 @@ extern struct page * find_or_create_page
 extern unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
 			unsigned int nr_pages, struct page **pages);
+extern int find_get_replica_pages(struct address_space *mapping,
+			pgoff_t start, unsigned int nr_pages,
+			struct page **pages);
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
@@ -118,7 +121,10 @@ int add_to_page_cache(struct page *page,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				unsigned long index, int gfp_mask);
 extern void remove_from_page_cache(struct page *page);
+extern int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+				pgoff_t offset);
 extern void __remove_from_page_cache(struct page *page);
+extern struct page *__page_cache_lookup(struct address_space *mapping, pgoff_t offset);
 extern atomic_t nr_pagecache;
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/pagevec.h 500-user_text_replication/include/linux/pagevec.h
--- 490-amd_sysrq_t/include/linux/pagevec.h	2002-12-09 18:46:25.000000000 -0800
+++ 500-user_text_replication/include/linux/pagevec.h	2003-12-12 16:42:18.000000000 -0800
@@ -24,6 +24,8 @@ void __pagevec_lru_add_active(struct pag
 void pagevec_strip(struct pagevec *pvec);
 unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned int nr_pages);
+unsigned int pagevec_lookup_replicas(struct pagevec *pvec,
+		struct address_space *mapping, unsigned int nr_pages);
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/radix-tree.h 500-user_text_replication/include/linux/radix-tree.h
--- 490-amd_sysrq_t/include/linux/radix-tree.h	2003-04-21 14:14:50.000000000 -0700
+++ 500-user_text_replication/include/linux/radix-tree.h	2003-12-12 16:42:18.000000000 -0800
@@ -41,7 +41,7 @@
 do {					\
 	(root)->rnode = NULL;		\
 } while (0)
 
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+extern void *radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 extern unsigned int
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/init/main.c 500-user_text_replication/init/main.c
--- 490-amd_sysrq_t/init/main.c	2003-12-11 17:16:53.000000000 -0800
+++ 500-user_text_replication/init/main.c	2003-12-12 16:42:18.000000000 -0800
@@ -83,6 +83,7 @@ extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void pte_chain_init(void);
 extern void radix_tree_init(void);
+extern void page_cache_leaf_init(void);
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
@@ -456,6 +457,7 @@ asmlinkage void __init start_kernel(void
 	security_scaffolding_startup();
 	vfs_caches_init(num_physpages);
 	radix_tree_init();
+	page_cache_leaf_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/lib/radix-tree.c 500-user_text_replication/lib/radix-tree.c
--- 490-amd_sysrq_t/lib/radix-tree.c	2003-04-21 14:14:53.000000000 -0700
+++ 500-user_text_replication/lib/radix-tree.c	2003-12-12 16:42:18.000000000 -0800
@@ -18,6 +18,7 @@
  */
 #include
+#include
 #include
 #include
 #include
@@ -168,8 +169,11 @@ static int radix_tree_extend(struct radi
  *	@item:		item to insert
  *
  *	Insert an item into the radix tree at position @index.
+ *
+ *	If the insertion fails because a duplicate element is present,
+ *	return that element.
  */
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+void *radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
 {
 	struct radix_tree_node *node = NULL, *tmp, **slot;
 	unsigned int height, shift;
@@ -179,7 +183,7 @@ int radix_tree_insert(struct radix_tree_
 	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
-			return error;
+			return ERR_PTR(error);
 	}
 
 	slot = &root->rnode;
@@ -190,7 +194,7 @@ int radix_tree_insert(struct radix_tree_
 		if (*slot == NULL) {
 			/* Have to add a child node.  */
 			if (!(tmp = radix_tree_node_alloc(root)))
-				return -ENOMEM;
+				return ERR_PTR(-ENOMEM);
 			*slot = tmp;
 			if (node)
 				node->count++;
@@ -205,7 +209,7 @@ int radix_tree_insert(struct radix_tree_
 	}
 
 	if (*slot != NULL)
-		return *slot;	/* used to be -EEXIST */
 	if (node)
 		node->count++;
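
The radix_tree_insert() calling convention changes here: instead of 0/-EEXIST/-ENOMEM it now returns an ERR_PTR() on failure, the already-present item on a collision, and NULL on success (the success return itself is outside the hunks shown, but the filemap callers below rely on NULL meaning success). A caller-side sketch of the new convention, with hypothetical names:

	/* hypothetical caller, illustrating the new convention */
	static int insert_or_reuse(struct radix_tree_root *root, unsigned long index,
				   void *new_item, void **existing)
	{
		void *old = radix_tree_insert(root, index, new_item);

		if (IS_ERR(old))
			return PTR_ERR(old);	/* -ENOMEM etc., as before */
		if (old) {
			*existing = old;	/* slot occupied: old is the existing item */
			return -EEXIST;
		}
		return 0;			/* NULL: new_item went in */
	}
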
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/filemap.c 500-user_text_replication/mm/filemap.c
--- 490-amd_sysrq_t/mm/filemap.c	2003-12-11 17:16:05.000000000 -0800
+++ 500-user_text_replication/mm/filemap.c	2003-12-12 16:42:18.000000000 -0800
@@ -10,6 +10,7 @@
  * the NFS filesystem used to do this differently, for example)
  */
 #include
+#include
 #include
 #include
 #include
@@ -91,6 +92,254 @@
  */
 
 /*
+ * If replication is on, only the node-local page will be returned.  If
+ * there is not a local page, it will not find anything.
+ *
+ * If find_any is set, a search for all pages will be done even if
+ * replication is on.  This is useful when we're trying to make a
+ * local copy of the page and we just want any old copy of it.
+ */
+enum page_search {
+	PAGE_LOCAL,
+	PAGE_ANY
+};
+
+#ifndef CONFIG_MAPPING_REPLICATE
+/*
+ * This is an attempt to keep the overhead when not doing replication
+ * to a bare minimum.  Instead of storing a real page_cache_leaf in
+ * the radix tree, a plain page pointer is stored.
+ *
+ * This abstraction allows more common code to be used for both the
+ * replicated, and non-replicated cases.
+ */
+struct page_cache_leaf {
+	struct page page;
+};
+
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf,
+		struct address_space *mapping, enum page_search search_type)
+{
+	return &leaf->page;
+}
+
+#define leaf_free(leaf)		do {} while (0)
+#define leaf_preload(gfpflags)	(0)
+
+static inline struct page *make_local_replica_lock(struct address_space *mapping,
+		struct page *page)
+{
+	return page;
+}
+
+static inline void drop_replica_pages(struct address_space *mapping)
+{
+}
+
+void collapse_replication(struct address_space *mapping,
+		struct file *file)
+{
+}
+
+static inline struct page *make_local_replica(struct address_space *mapping, struct page *page, struct page_cache_leaf *leaf)
+{
+	return page;
+}
+
+#else /* CONFIG_MAPPING_REPLICATE */
+
+struct page_cache_leaf {
+	struct page* pages[MAX_NUMNODES];
+	/*
+	 * This doesn't need to be an atomic because it's always
+	 * modified under mapping->page_lock
+	 */
+	int count;
+	/*
+	 * the duplicate_lock is not here to prevent any harmful races, it
+	 * keeps collision overhead to a minimum.
+	 *
+	 * When 2 CPUs on the same node get into find_get_page() together, they
+	 * can both try to make a copy at the same time.  One is bound to get
+	 * -EEXIST and back off properly, but copying that page is expensive.
+	 * Better to just spin on this and wait for the other cpu to do the copy.
+	 *
+	 * This lock could be per-node.
+	 */
+	spinlock_t duplicate_lock;
+};
+
+DEFINE_PER_CPU(struct page_cache_leaf *, page_cache_leaf_preloads) = { 0, };
+static kmem_cache_t *page_cache_leaf_cachep;
+
+static inline void leaf_free(struct page_cache_leaf *leaf)
+{
+	struct page_cache_leaf **preload;
+
+	preload = &get_cpu_var(page_cache_leaf_preloads);
+	if (!*preload)
+		*preload = leaf;
+	else
+		kmem_cache_free(page_cache_leaf_cachep, leaf);
+	put_cpu_var(page_cache_leaf_preloads);
+}
+
+void page_cache_leaf_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct page_cache_leaf *leaf = node;
+
+	memset(node, 0, sizeof(struct page_cache_leaf));
+	spin_lock_init(&leaf->duplicate_lock);
+}
+
+int leaf_preload(int gfp_mask)
+{
+	struct page_cache_leaf **preload;
+	int error = 0;
+
+	preload = &get_cpu_var(page_cache_leaf_preloads);
+	if (!*preload)
+		*preload = kmem_cache_alloc(page_cache_leaf_cachep, gfp_mask);
+	if (!*preload)
+		error = -ENOMEM;
+
+	put_cpu_var(page_cache_leaf_preloads);
+
+	return error;
+}
+
+/*
+ * for the non-numa case, this can just cast *leaf to a page and return
+ */
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf,
+		struct address_space *mapping, enum page_search search_type)
+{
+	struct page *page = NULL;
+	int nid = numa_node_id();
+
+	/* Always look for a local copy first */
+	if (mapping_replicate(mapping))
+		page = leaf->pages[nid];
+
+	if (!page && (!mapping_replicate(mapping) || (search_type == PAGE_ANY)))
+		for (nid = 0; nid < numnodes; nid++) {
+			page = leaf->pages[nid];
+			if (page)
+				break;
+		}
+	return page;
+}
+#endif
+
+void __init page_cache_leaf_init(void)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+	page_cache_leaf_cachep = kmem_cache_create("page_cache_leaf",
+			sizeof(struct page_cache_leaf), 0,
+			0, page_cache_leaf_ctor, NULL);
+	if (!page_cache_leaf_cachep)
+		panic ("Failed to create page_cache_leaf cache\n");
+#endif
+}
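
The leaf preload only stays on the right CPU because add_to_page_cache() (further down) calls leaf_preload() while radix_tree_preload() still has preemption disabled, so the CPU that filled page_cache_leaf_preloads is also the CPU that consumes it in __insert_into_page_cache(). A condensed sketch of that pairing, trimmed from the call site below (error handling simplified, not a verbatim copy):

	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);	/* disables preemption */
	if (error)
		return error;

	error = leaf_preload(gfp_mask & ~__GFP_HIGHMEM);	/* fills this CPU's slot */
	if (!error) {
		spin_lock(&mapping->page_lock);
		error = __insert_into_page_cache(page, mapping, offset); /* may consume the slot */
		spin_unlock(&mapping->page_lock);
	}
	radix_tree_preload_end();				/* re-enables preemption */
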
+#ifndef CONFIG_MAPPING_REPLICATE
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset)
+{
+	struct page_cache_leaf *leaf, *errptr;
+	int error = 0;
+
+	leaf = container_of(page, struct page_cache_leaf, page);
+	errptr = radix_tree_insert(&mapping->page_tree, offset, leaf);
+
+	if (IS_ERR(errptr))
+		error = PTR_ERR(errptr);
+
+	return error;
+}
+#else
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset)
+{
+	int error = 0;
+	int nid;
+	struct page_cache_leaf *leaf, **newleaf;
+
+	nid = page_to_nid(page);
+
+	/*
+	 * If the leaf preload allocation failed, then at least check the
+	 * tree to see if a leaf is already present.  If one is present,
+	 * then we got lucky and didn't really need to allocate anything.
+	 *
+	 * If that lookup *fails*, then we were really out of memory and
+	 * we error out.
+	 */
+	newleaf = &get_cpu_var(page_cache_leaf_preloads);
+	if (*newleaf)
+		leaf = radix_tree_insert(&mapping->page_tree, offset, *newleaf);
+	else {
+		leaf = radix_tree_lookup(&mapping->page_tree, offset);
+		if (!leaf)
+			leaf = ERR_PTR(-ENOMEM);
+	}
+
+	if (IS_ERR(leaf)) {
+		error = PTR_ERR(leaf);
+		goto out;
+	}
+
+	/* there's already a leaf node there */
+	if (!mapping_replicate(mapping) && leaf) {
+		error = -EEXIST;
+		goto out;
+	}
+
+	/* successful insertion, absorb the preloaded leaf */
+	if (!leaf) {
+		leaf = *newleaf;
+		*newleaf = NULL;
+	}
+
+	/*
+	 * A !PageUptodate() will have some I/O done on it shortly.
+	 * The readahead code puts pages like that in here.  If
+	 * there's a replica available, don't bother putting the
+	 * page in, because the I/O is a duplicate.
+	 */
+	if (leaf->pages[nid]) {
+		error = -EEXIST;
+	} else {
+		/*
+		 * Instead of -EEXIST, we could look for an
+		 * Uptodate copy, and use that to make this
+		 * page Uptodate, making a local replica.
+		 */
+		if (leaf->count > 1 && !PageUptodate(page)) {
+			error = -EEXIST;
+		} else {
+			leaf->pages[nid] = page;
+			leaf->count++;
+		}
+	}
+out:
+	put_cpu_var(page_cache_leaf_preloads);
+	return error;
+}
+#endif
+
+struct page*
+__page_cache_lookup(struct address_space *mapping, pgoff_t offset)
+{
+	struct page *page = NULL;
+	struct page_cache_leaf *leaf;
+
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	if (!leaf)
+		goto out;
+
+	page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+out:
+	return page;
+}
+
+/*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe.  The caller must hold a write_lock on the mapping's page_lock.
@@ -98,8 +347,21 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-
-	radix_tree_delete(&mapping->page_tree, page->index);
+#ifdef CONFIG_MAPPING_REPLICATE
+	struct page_cache_leaf *leaf;
+	leaf = radix_tree_lookup(&mapping->page_tree, page->index);
+	leaf->pages[page_to_nid(page)] = NULL;
+	if (--leaf->count == 0) {
+#endif
+		radix_tree_delete(&mapping->page_tree, page->index);
+#ifdef CONFIG_MAPPING_REPLICATE
+		/*
+		 * if there is a free preload slot for this CPU, put the
+		 * leaf back there instead of freeing it
+		 */
+		leaf_free(leaf);
+	}
+#endif
 
 	list_del(&page->list);
 	page->mapping = NULL;
@@ -128,6 +390,22 @@ static inline int sync_page(struct page
 	return 0;
 }
 
+#ifdef CONFIG_MAPPING_REPLICATE
+/*
+ * synchronized by i_sem
+ */
+extern void drop_replica_pages(struct address_space *mapping);
+inline void collapse_replication(struct address_space *mapping,
+		struct file *file)
+{
+	if (mapping_replicate(mapping)) {
+		atomic_dec(&mapping->replicate);
+		drop_replica_pages(mapping);
+		atomic_inc(&mapping->host->i_writecount);
+	}
+}
+#endif
+
 /**
  * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
  * @mapping:	address space structure to write
@@ -251,10 +529,16 @@ int add_to_page_cache(struct page *page,
 {
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
+	if (error != 0)
+		goto err;
+
+	/* this benefits from the radix_tree_preload()'s preempt_disable() */
+	error = leaf_preload(gfp_mask & ~__GFP_HIGHMEM);
+
 	if (error == 0) {
 		page_cache_get(page);
 		spin_lock(&mapping->page_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
+		error = __insert_into_page_cache(page, mapping, offset);
 		if (!error) {
 			SetPageLocked(page);
 			___add_to_page_cache(page, mapping, offset);
@@ -264,11 +548,17 @@ int add_to_page_cache(struct page *page,
 		spin_unlock(&mapping->page_lock);
 		radix_tree_preload_end();
 	}
+err:
 	return error;
 }
 
 EXPORT_SYMBOL(add_to_page_cache);
 
+/*
+ * The pages will *not* be added to the LRU immediately.  They're only
+ * added after the entire pagevec is filled up.  Don't worry, they'll
+ * get there eventually.
+ */
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, int gfp_mask)
 {
@@ -385,24 +675,236 @@ void __lock_page(struct page *page)
 
 EXPORT_SYMBOL(__lock_page);
 
+#ifdef CONFIG_MAPPING_REPLICATE
 /*
- * a rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * This is fairly lazy with preemption to make the code simpler.  It doesn't
+ * need to be perfect.  Making a local replica is by no means required.  If the
+ * replica page allocation fails, one of two things happens:
+ * 1. page cache returns non-local page, which gets mapped in somewhere.
+ *    things are slightly slower
+ * 2. page cache returns NULL, when there was a page in the cache.
+ *    I/O is resubmitted for the page, and a replica is made with
+ *    the new data.
+ */
+DEFINE_PER_CPU(struct page *, replica_preloads) = { NULL, };
+void refill_replica_page_cpu(void)
+{
+	int cpu = get_cpu();
+	int nid = cpu_to_node(cpu);
+	unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_NODE_STRICT;
+	struct page **page = &__get_cpu_var(replica_preloads);
+
+	if (!*page)
+		*page = alloc_pages_node(nid, gfp_mask, 0);
+
+	put_cpu();
+}
+
+/* I want to see this in the profiles */
+void make_local_replica_copy(struct page *dst, struct page *src)
+{
+	if (!page_is_local(dst)) {
+		printk("%s(): %d dst not local: %08lx src: %08lx\n",
+			__func__, smp_processor_id(),
+			page_to_pfn(dst), page_to_pfn(src));
+	}
+	BUG_ON(!PageUptodate(src));
+	copy_highpage(dst, src);
+}
+
+static struct page *__make_local_replica(struct address_space *mapping, struct page *page) {
+	struct page *copy = page;
+	struct page **prealloc;
+	int err;
+
+	if (!page)
+		goto out;
+
+	if (!mapping_replicate(mapping))
+		goto out;
+
+	/* something is probably writing into the source page
+	 * do *not* wait for this to get unlocked.  We're under
+	 * a lock here.  Just punt on the copy. */
+	if (TestSetPageLocked(page))
+		goto out;
+
+	/* the old page got unhashed since we pulled it out */
+	if (page->mapping != mapping) {
+		unlock_page(page);
+		goto out;
+	}
+
+	prealloc = &get_cpu_var(replica_preloads);
+	if (*prealloc) {
+		copy = *prealloc;
+		*prealloc = NULL;
+	}
+	put_cpu_var(replica_preloads);
+
+	if (!copy)
+		goto out;
+
+	make_local_replica_copy(copy, page);
+	/*
+	 * Do this now so that add_to_page_cache_lru() won't confuse this
+	 * with a readahead page that should get -EEXIST instead of just
+	 * getting added.
+	 */
+	SetPageUptodate(copy);
+
+	/*
+	 * This should never actually have to allocate memory.  It will
+	 * be able to add the page to the already existing leaf.  The
+	 * leaf can't go away because we hold a ref count on the source
+	 * page.
+	 */
+	err = add_to_page_cache_lru(copy, mapping, page->index, GFP_ATOMIC);
+	unlock_page(page);
+	switch (err) {
+	case 0:
+		unlock_page(copy);
+		break;
+	case -EEXIST:
+		page_cache_release(copy);
+		goto out;
+	default:
+		printk("%s(): ?? %d\n", __FUNCTION__, err);
+		page_cache_release(copy);
+		dump_stack();
+		goto out;
+	}
+	return copy;
+out:
+	return page;
+}
+
+/*
+ * We can not be making copies of pages that aren't up to date yet,
+ * so this function makes sure of that.
+ *
+ * Instead of just returning the information that the page is
+ * unusable, it could go looking for other sources for the page, perhaps
+ * another node.
+ *
+ * The logic for this was taken from read_cache_page()
  */
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+static inline int replica_source_uptodate(struct address_space *mapping, struct page *page)
 {
-	struct page *page;
+	int ret = 1;
+
+	if (likely(PageUptodate(page)))
+		goto out;
+
+	lock_page(page);
+	if (!PageUptodate(page) || !page->mapping)
+		ret = 0;
+	unlock_page(page);
+out:
+	return ret;
+}
+
+/*
+ * This needs to be called without mapping->page_lock held
+ */
+static inline struct page *make_local_replica(struct address_space *mapping, struct page *page, struct page_cache_leaf *leaf)
+{
+	struct page *copy;
+
+	if (page_is_local(page))
+		return page;
+
+	/*
+	 * if there's a problem with the source page, don't make a copy
+	 * of it.  The caller will fix this up.
+	 */
+	if (!replica_source_uptodate(mapping, page))
+		return page;
+
+	refill_replica_page_cpu();
+
+	spin_lock(&leaf->duplicate_lock);
+	/*
+	 * now that we have the lock, do a crude check to see if anyone
+	 * else has filled in the page we were looking for
+	 */
+	if (mapping_replicate(mapping) &&
+	    leaf->pages[numa_node_id()]) {
+		spin_unlock(&leaf->duplicate_lock);
+		page_cache_release(page);
+		return NULL;
+	}
+	copy = __make_local_replica(mapping, page);
+	spin_unlock(&leaf->duplicate_lock);
+
+	if (copy != page) {
+		page_cache_release(page);
+		return copy;
+	}
+
+	return page;
+}
+
+static struct page *make_local_replica_lock(struct address_space *mapping, struct page *page) {
+	struct page *copy;
+
+	return page;
+
+	refill_replica_page_cpu();
+	copy = __make_local_replica(mapping, page);
+
+	/*
+	 * this is the cowardly way to do it.  Add the new copy, and pray
+	 * that it shows up :)  If the replication appears to have worked,
+	 * drop the references to the source page.  If the new page
+	 * got removed in the meantime, find_lock_page() will just
+	 * redo the locking anyway.
+	 */
+	if (copy != page) {
+		unlock_page(page);
+		page_cache_release(page);
+		copy = find_lock_page(mapping, page->index);
+	}
+
+	return copy;
+}
+#endif
+
+/*
+ * With no page replication, this is a rather lightweight function for
+ * finding and getting a reference to a hashed page atomically.
+ *
+ * When replicating pages, this becomes the place where the source for
+ * copies is found and the new copy made.
+ */
+struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+{
+	struct page_cache_leaf *leaf;
+	struct page *page, *copy;
 
 	/*
 	 * We scan the hash list read-only.  Addition to and removal from
 	 * the hash-list needs a held write-lock.
 	 */
+
+repeat:
 	spin_lock(&mapping->page_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	/* nothing found */
+	if (!leaf) {
+		spin_unlock(&mapping->page_lock);
+		return NULL;
+	}
+
+	page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+	page_cache_get(page);
 	spin_unlock(&mapping->page_lock);
-	return page;
+
+	/* A NULL in this context is like -EEXIST.  Try again. */
+	copy = make_local_replica(mapping, page, leaf);
+	if (!copy)
+		goto repeat;
+
+	return copy;
 }
 
 EXPORT_SYMBOL(find_get_page);
@@ -415,10 +917,11 @@ struct page *find_trylock_page(struct ad
 	struct page *page;
 
 	spin_lock(&mapping->page_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = __page_cache_lookup(mapping, offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
 	spin_unlock(&mapping->page_lock);
+	page = make_local_replica_lock(mapping, page);
 	return page;
 }
 
@@ -442,12 +945,13 @@ struct page *find_lock_page(struct addre
 	spin_lock(&mapping->page_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = __page_cache_lookup(mapping, offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			spin_unlock(&mapping->page_lock);
 			lock_page(page);
+			page = make_local_replica_lock(mapping, page);
 			spin_lock(&mapping->page_lock);
 
 			/* Has the page been truncated while we slept? */
@@ -489,6 +993,8 @@ struct page *find_or_create_page(struct
 	int err;
 repeat:
 	page = find_lock_page(mapping, index);
+	/* this only locks if a replica is made */
+	page = make_local_replica_lock(mapping, page);
 	if (!page) {
 		if (!cached_page) {
 			cached_page = alloc_page(gfp_mask);
@@ -526,22 +1032,85 @@ EXPORT_SYMBOL(find_or_create_page);
  *
  * find_get_pages() returns the number of pages which were found.
  */
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
-			    unsigned int nr_pages, struct page **pages)
+unsigned int find_get_pages(struct address_space *mapping,
+			    pgoff_t start, unsigned int nr_pages,
+			    struct page **pages)
 {
-	unsigned int i;
 	unsigned int ret;
+	int i;
 
 	spin_lock(&mapping->page_lock);
+
 	ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages,
 				start, nr_pages);
-	for (i = 0; i < ret; i++)
+
+	for (i = 0; i < ret; i++) {
+		/*
+		 * The radix tree lookups return leaves, which
+		 * must be converted to pages
+		 */
+		struct page_cache_leaf * leaf = (struct page_cache_leaf *)pages[i];
+		pages[i] = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
 		page_cache_get(pages[i]);
+	}
 	spin_unlock(&mapping->page_lock);
 	return ret;
 }
 
 /*
+ * This is used to find _just_ the replicated pages.  It's
+ * used when we need to write to something where replication
+ * is active.
+ */
+int find_get_replica_pages(struct address_space *mapping,
+			   pgoff_t start, unsigned int nr_pages,
+			   struct page **pages)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+	unsigned int nid = numa_node_id();
+	unsigned int nr_leaves;
+	struct page_cache_leaf *leaf;
+	struct page_cache_leaf **leaves = (struct page_cache_leaf **)pages;
+	int pages_seen;
+	int i, j;
+
+	/*
+	 * this is the number of leaves which have been converted
+	 * to pages to be returned.  Any array indexes <= this
+	 * number are pages.  Any > are leaves
+	 */
+	int nr_ret_pages = 0;
+
+	spin_lock(&mapping->page_lock);
+
+	nr_leaves = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)leaves, start, nr_pages);
+	for (i = 0; i < nr_leaves; i++) {
+		leaf = leaves[i];
+		if (leaf->count <= 1)
+			continue;
+
+		for (j = 0, pages_seen = 0;
+		     j < MAX_NUMNODES && pages_seen < leaf->count;
+		     j++) {
+			if (j == nid || !leaf->pages[j])
+				continue;
+			pages[nr_ret_pages] = leaf->pages[j];
+			page_cache_get(pages[nr_ret_pages]);
+			pages_seen++;
+			nr_ret_pages++;
+		}
+		if (i < nr_ret_pages)
+			i = nr_ret_pages;	/* don't forget i++ */
+	}
+	spin_unlock(&mapping->page_lock);
+	return nr_ret_pages;
+#else
+	return 0;
+#endif
+}
+
+/*
  * Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
@@ -1814,6 +2383,7 @@ generic_file_aio_write_nolock(struct kio
 		 */
 		fault_in_pages_readable(buf, bytes);
 
+		collapse_replication(mapping, file);
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
 		if (!page) {
 			status = -ENOMEM;
@@ -1995,3 +2565,38 @@ out:
 }
 
 EXPORT_SYMBOL_GPL(generic_file_direct_IO);
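
file_try_replicate() below is the piece that ties the mmap path to everything above. Roughly, the lifecycle looks like this -- a simplified, hypothetical walk-through for illustration, not patch code (locking, error handling and reference counting omitted):

	static void replication_lifecycle(struct file *file, struct inode *inode)
	{
		/* 1. mmap of a read-only, non-VM_MAYWRITE mapping marks the file:
		 *    __deny_write_access(file, 1) is taken under i_sem */
		file_try_replicate(file);

		/* 2. while mapping_replicate() stays true, find_get_page() keeps
		 *    handing out (or creating) node-local copies */

		/* 3. an eventual writer undoes it: get_write_access() sees
		 *    i_writecount < 0, collapse_replication() decrements
		 *    ->replicate, drop_replica_pages() frees the copies, and
		 *    the writer gets its i_writecount back */
		if (!get_write_access(inode))
			put_write_access(inode);
	}
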
+
+/*
+ * Some of this code is a bit redundant in the case where we're replicating
+ * an executable.  It does a deny_write_access() just before this is called,
+ * so this deny_write_access()'s error checking is unnecessary in that case.
+ *
+ * For overall reduction of code and cleanliness, we do a little extra here
+ */
+int file_try_replicate(struct file *file)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+	int error = 1;
+
+	down(&inode->i_sem);
+	if (!mapping_replicate(inode->i_mapping)) {
+		error = __deny_write_access(file, 1);
+		if (error)
+			goto out_fail;
+
+		/*
+		 * there used to be a check here for dirty pages.  it
+		 * was incorrect.  dirty pages are allowed, the only
+		 * real problem is !Uptodate pages.
+		 */
+		BUG_ON(atomic_read(&inode->i_writecount) >= 0);
+		up(&inode->i_sem);
+		return 1;
+	}
+
+out_fail:
+	up(&inode->i_sem);
+#endif
+	return 0;
+}
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/memory.c 500-user_text_replication/mm/memory.c
--- 490-amd_sysrq_t/mm/memory.c	2003-12-11 17:16:39.000000000 -0800
+++ 500-user_text_replication/mm/memory.c	2003-12-12 16:42:18.000000000 -0800
@@ -1496,8 +1496,11 @@ retry:
 		inc_rss(mm, new_page);
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		if (write_access)
+		if (write_access) {
 			entry = pte_mkwrite(pte_mkdirty(entry));
+			BUG_ON(new_page->mapping &&
+			       mapping_replicate(new_page->mapping));
+		}
 		set_pte(page_table, entry);
 		pte_chain = page_add_rmap(new_page, page_table, pte_chain);
 		pte_unmap(page_table);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/mmap.c 500-user_text_replication/mm/mmap.c
--- 490-amd_sysrq_t/mm/mmap.c	2003-12-11 17:29:48.000000000 -0800
+++ 500-user_text_replication/mm/mmap.c	2003-12-12 16:42:18.000000000 -0800
@@ -543,6 +543,7 @@ unsigned long do_mmap_pgoff(struct file
 	inode = file ? file->f_dentry->d_inode : NULL;
 
 	if (file) {
+		int try_to_replicate = 1;
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -564,11 +565,19 @@ unsigned long do_mmap_pgoff(struct file
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			if (!(file->f_mode & FMODE_WRITE))
 				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
+			/*
+			 * If this is set, there is a possibility of a conversion
+			 * to a writeable area later.  Do not replicate
+			 */
+			if (vm_flags & VM_MAYWRITE)
+				try_to_replicate = 0;
+
 			/* fall through */
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
+			if (try_to_replicate)
+				file_try_replicate(file);
 			break;
 
 		default:
@@ -661,6 +670,7 @@ munmap_back:
 		if (error)
 			goto free_vma;
 		correct_wcount = 1;
+		file_try_replicate(file);
 	}
 	vma->vm_file = file;
 	get_file(file);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/readahead.c 500-user_text_replication/mm/readahead.c
--- 490-amd_sysrq_t/mm/readahead.c	2003-12-11 17:10:40.000000000 -0800
+++ 500-user_text_replication/mm/readahead.c	2003-12-12 16:42:18.000000000 -0800
@@ -236,7 +236,7 @@ __do_page_cache_readahead(struct address
 		if (page_offset > end_index)
 			break;
 
-		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		page = __page_cache_lookup(mapping, page_offset);
 		if (page)
 			continue;
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/swap.c 500-user_text_replication/mm/swap.c
--- 490-amd_sysrq_t/mm/swap.c	2003-11-24 16:12:33.000000000 -0800
+++ 500-user_text_replication/mm/swap.c	2003-12-12 16:42:18.000000000 -0800
@@ -357,6 +357,12 @@ unsigned int pagevec_lookup(struct pagev
 	return pagevec_count(pvec);
 }
 
+unsigned int pagevec_lookup_replicas(struct pagevec *pvec, struct address_space *mapping, unsigned int nr_pages)
+{
+	pvec->nr = find_get_replica_pages(mapping, 0, nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 #ifdef CONFIG_SMP
 /*
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/swap_state.c 500-user_text_replication/mm/swap_state.c
--- 490-amd_sysrq_t/mm/swap_state.c	2003-10-01 11:35:37.000000000 -0700
+++ 500-user_text_replication/mm/swap_state.c	2003-12-12 16:42:18.000000000 -0800
@@ -38,6 +38,9 @@ struct address_space swapper_space = {
 	.truncate_count  = ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
+#ifdef CONFIG_MAPPING_REPLICATE
+	.replicate	= ATOMIC_INIT(0),
+#endif
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -198,7 +201,7 @@ int move_to_swap_cache(struct page *page
 	spin_lock(&swapper_space.page_lock);
 	spin_lock(&mapping->page_lock);
 
-	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+	err = __insert_into_page_cache(page, &swapper_space, entry.val);
 	if (!err) {
 		__remove_from_page_cache(page);
 		___add_to_page_cache(page, &swapper_space, entry.val);
@@ -234,7 +237,7 @@ int move_from_swap_cache(struct page *pa
 	spin_lock(&swapper_space.page_lock);
 	spin_lock(&mapping->page_lock);
 
-	err = radix_tree_insert(&mapping->page_tree, index, page);
+	err = __insert_into_page_cache(page, mapping, index);
 	if (!err) {
 		__delete_from_swap_cache(page);
 		___add_to_page_cache(page, mapping, index);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/truncate.c 500-user_text_replication/mm/truncate.c
--- 490-amd_sysrq_t/mm/truncate.c	2003-10-14 15:50:36.000000000 -0700
+++ 500-user_text_replication/mm/truncate.c	2003-12-12 16:42:18.000000000 -0800
@@ -178,6 +178,33 @@ void truncate_inode_pages(struct address
 
 EXPORT_SYMBOL(truncate_inode_pages);
 
+
+/**
+ * drop_replica_pages - remove all replicated pages from a mapping
+ * @mapping: mapping to remove replication from
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void drop_replica_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	int num;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while ((num = pagevec_lookup_replicas(&pvec, mapping, PAGEVEC_SIZE))) {
+		for (i=0; i