diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/fs/hugetlbfs/inode.c 480-implicit_huge_pages/fs/hugetlbfs/inode.c
--- 470-stacktrace/fs/hugetlbfs/inode.c	Mon Nov 17 18:29:49 2003
+++ 480-implicit_huge_pages/fs/hugetlbfs/inode.c	Fri Jan  9 23:09:13 2004
@@ -26,12 +26,17 @@
 #include
 #include
 #include
+#include
 #include
+#include
 
 /* some random number */
 #define HUGETLBFS_MAGIC	0x958458f6
 
+extern int mmap_use_hugepages;
+extern int mmap_hugepages_map_sz;
+
 static struct super_operations hugetlbfs_ops;
 static struct address_space_operations hugetlbfs_aops;
 struct file_operations hugetlbfs_file_operations;
@@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long pgoff, unsigned long flags);
 #else
-static unsigned long
+unsigned long
 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -114,6 +119,65 @@ hugetlb_get_unmapped_area(struct file *f
 	}
 }
 #endif
+
+int mmap_hugetlb_implicit(unsigned long len)
+{
+	/* Are we enabled? */
+	if (!mmap_use_hugepages)
+		return 0;
+	/* Must be HPAGE aligned */
+	if (len & ~HPAGE_MASK)
+		return 0;
+	/* Are we under the minimum size? */
+	if (mmap_hugepages_map_sz
+	    && len < (mmap_hugepages_map_sz << 20))
+		return 0;
+	/* Do we have enough free huge pages? */
+	if (!is_hugepage_mem_enough(len))
+		return 0;
+
+	return 1;
+}
+
+unsigned long
+try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long *flags)
+{
+	long pre_error = 0;
+
+	/* Check some prerequisites */
+	if (!capable(CAP_IPC_LOCK))
+		pre_error = -EPERM;
+	else if (file)
+		pre_error = -EINVAL;
+
+	/* Explicit requests for huge pages are allowed to return errors */
+	if (*flags & MAP_HUGETLB) {
+		if (pre_error)
+			return pre_error;
+		return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+	}
+
+	/*
+	 * When an implicit request fails, return 0 so we can
+	 * retry later with regular pages.
+	 */
+	if (mmap_hugetlb_implicit(len)) {
+		if (pre_error)
+			goto out;
+		addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags);
+		if (IS_ERR((void *)addr))
+			goto out;
+		else {
+			*flags |= MAP_HUGETLB;
+			return addr;
+		}
+	}
+
+out:
+	*flags &= ~MAP_HUGETLB;
+	return 0;
+}
 
 /*
  * Read a page. Again trivial. If it didn't already exist
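[Not part of the patch: a minimal userspace sketch of what the new
try_hugetlb_get_unmapped_area() path means for callers. With
mmap-use-hugepages enabled, CAP_IPC_LOCK, and enough free huge pages,
an ordinary anonymous mmap() that is HPAGE-aligned and at least
mmap-hugepages-min-mapping megabytes is transparently backed by huge
pages; on any failure the kernel silently falls back to regular pages.
The 2MB huge page size below is an assumption and is arch dependent.]

/* hypothetical demo -- plain mmap(), nothing here is new ABI */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#define ASSUMED_HPAGE_SIZE	(2UL * 1024 * 1024)	/* arch dependent */

int main(void)
{
	size_t len = 512UL * 1024 * 1024; /* HPAGE multiple, over 256MB default */
	size_t off;
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* touch one byte per (assumed) huge page to instantiate backing */
	for (off = 0; off < len; off += ASSUMED_HPAGE_SIZE)
		p[off] = 1;
	munmap(p, len);
	return 0;
}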
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/include/asm-i386/mman.h 480-implicit_huge_pages/include/asm-i386/mman.h
--- 470-stacktrace/include/asm-i386/mman.h	Mon Nov 17 18:29:33 2003
+++ 480-implicit_huge_pages/include/asm-i386/mman.h	Fri Jan  9 23:09:13 2004
@@ -11,6 +11,11 @@
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#ifdef CONFIG_HUGETLB_PAGE
+#define MAP_HUGETLB	0x04		/* Use huge pages */
+#else
+#define MAP_HUGETLB	0x00
+#endif
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
 
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/include/asm-ppc64/mman.h 480-implicit_huge_pages/include/asm-ppc64/mman.h
--- 470-stacktrace/include/asm-ppc64/mman.h	Mon Nov 17 18:28:58 2003
+++ 480-implicit_huge_pages/include/asm-ppc64/mman.h	Fri Jan  9 23:09:13 2004
@@ -18,6 +18,11 @@
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#ifdef CONFIG_HUGETLB_PAGE
+#define MAP_HUGETLB	0x04		/* Use huge pages */
+#else
+#define MAP_HUGETLB	0x00
+#endif
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
 
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/include/linux/hugetlb.h 480-implicit_huge_pages/include/linux/hugetlb.h
--- 470-stacktrace/include/linux/hugetlb.h	Fri Jan  9 17:40:08 2004
+++ 480-implicit_huge_pages/include/linux/hugetlb.h	Fri Jan  9 23:09:13 2004
@@ -120,4 +120,9 @@ static inline void set_file_hugepages(st
 
 #endif /* !CONFIG_HUGETLBFS */
 
+unsigned long
+hugetlb_get_unmapped_area(struct file *, unsigned long, unsigned long,
+			  unsigned long, unsigned long);
+
+
 #endif /* _LINUX_HUGETLB_H */
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/include/linux/mman.h 480-implicit_huge_pages/include/linux/mman.h
--- 470-stacktrace/include/linux/mman.h	Mon Nov 17 18:29:34 2003
+++ 480-implicit_huge_pages/include/linux/mman.h	Fri Jan  9 23:09:13 2004
@@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+#ifdef CONFIG_HUGETLB_PAGE
+	       _calc_vm_trans(flags, MAP_HUGETLB,    VM_HUGETLB   ) |
+#endif
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
 
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/include/linux/sysctl.h 480-implicit_huge_pages/include/linux/sysctl.h
--- 470-stacktrace/include/linux/sysctl.h	Fri Jan  9 22:25:24 2004
+++ 480-implicit_huge_pages/include/linux/sysctl.h	Fri Jan  9 23:09:13 2004
@@ -128,6 +128,10 @@ enum
 	KERN_PANIC_ON_OOPS=57,	/* int: whether we will panic on an oops */
 	KERN_HPPA_PWRSW=58,	/* int: hppa soft-power enable */
 	KERN_HPPA_UNALIGNED=59,	/* int: hppa unaligned-trap enable */
+	KERN_SHMUSEHUGEPAGES=60,	/* int: back shm with huge pages */
+	KERN_MMAPUSEHUGEPAGES=61,	/* int: back anon mmap with huge pages */
+	KERN_HPAGES_PER_FILE=62,	/* int: max huge pages per file */
+	KERN_HPAGES_MAP_SZ=63,	/* int: min size (MB) of mapping */
 };
 
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/ipc/shm.c 480-implicit_huge_pages/ipc/shm.c
--- 470-stacktrace/ipc/shm.c	Mon Nov 17 18:28:20 2003
+++ 480-implicit_huge_pages/ipc/shm.c	Fri Jan  9 23:09:13 2004
@@ -32,6 +32,9 @@
 
 #define shm_flags	shm_perm.mode
 
+extern int shm_use_hugepages;
+extern int shm_hugepages_per_file;
+
 static struct file_operations shm_file_operations;
 static struct vm_operations_struct shm_vm_ops;
 
@@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v
 	.nopage	= shmem_nopage,
 };
 
+#ifdef CONFIG_HUGETLBFS
+int shm_with_hugepages(int shmflag, size_t size)
+{
+	/* flag specified explicitly */
+	if (shmflag & SHM_HUGETLB)
+		return 1;
+	/* Are we disabled? */
+	if (!shm_use_hugepages)
+		return 0;
+	/* Must be HPAGE aligned */
+	if (size & ~HPAGE_MASK)
+		return 0;
+	/* Are we under the max per file? */
+	if ((size >> HPAGE_SHIFT) > shm_hugepages_per_file)
+		return 0;
+	/* Do we have enough free huge pages? */
+	if (!is_hugepage_mem_enough(size))
+		return 0;
+
+	return 1;
+}
+#else
+int shm_with_hugepages(int shmflag, size_t size) { return 0; }
+#endif
+
 static int newseg (key_t key, int shmflg, size_t size)
 {
 	int error;
@@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg
 		return error;
 	}
 
-	if (shmflg & SHM_HUGETLB)
+	if (shm_with_hugepages(shmflg, size)) {
+		shmflg |= SHM_HUGETLB;
 		file = hugetlb_zero_setup(size);
+	}
 	else {
 		sprintf (name, "SYSV%08x", key);
 		file = shmem_file_setup(name, size, VM_ACCOUNT);
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/kernel/sysctl.c 480-implicit_huge_pages/kernel/sysctl.c
--- 470-stacktrace/kernel/sysctl.c	Fri Jan  9 22:57:58 2004
+++ 480-implicit_huge_pages/kernel/sysctl.c	Fri Jan  9 23:09:13 2004
@@ -60,6 +60,8 @@ extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
 extern int min_free_kbytes;
+extern int shm_use_hugepages, shm_hugepages_per_file;
+extern int mmap_use_hugepages, mmap_hugepages_map_sz;
 extern int min_timeslice;
 extern int max_timeslice;
 extern int child_penalty;
@@ -596,6 +598,40 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_HUGETLBFS
+	{
+		.ctl_name	= KERN_SHMUSEHUGEPAGES,
+		.procname	= "shm-use-hugepages",
+		.data		= &shm_use_hugepages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_MMAPUSEHUGEPAGES,
+		.procname	= "mmap-use-hugepages",
+		.data		= &mmap_use_hugepages,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_HPAGES_PER_FILE,
+		.procname	= "shm-hugepages-per-file",
+		.data		= &shm_hugepages_per_file,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_HPAGES_MAP_SZ,
+		.procname	= "mmap-hugepages-min-mapping",
+		.data		= &mmap_hugepages_map_sz,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
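[Not part of the patch: the four kern_table entries above surface as
files under /proc/sys/kernel/, named by their .procname strings. A
minimal root-run sketch for enabling the feature; the values written
are examples, and the paths assume this patch is applied.]

/* hypothetical helper: echo values into the new sysctls */
#include <stdio.h>

static int write_knob(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	fclose(f);
	return 0;
}

int main(void)
{
	write_knob("shm-use-hugepages", "1");		/* implicit huge shm */
	write_knob("mmap-use-hugepages", "1");		/* implicit huge anon mmap */
	write_knob("mmap-hugepages-min-mapping", "256"); /* min mapping, in MB */
	return 0;
}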
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/mm/mmap.c 480-implicit_huge_pages/mm/mmap.c
--- 470-stacktrace/mm/mmap.c	Fri Jan  9 22:25:55 2004
+++ 480-implicit_huge_pages/mm/mmap.c	Fri Jan  9 23:09:13 2004
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -59,6 +60,9 @@ EXPORT_SYMBOL(sysctl_overcommit_memory);
 EXPORT_SYMBOL(sysctl_overcommit_ratio);
 EXPORT_SYMBOL(vm_committed_space);
 
+int mmap_use_hugepages = 0;
+int mmap_hugepages_map_sz = 256;
+
 /*
  * Requires inode->i_mapping->i_shared_sem
  */
@@ -480,7 +484,7 @@ unsigned long do_mmap_pgoff(struct file
 	int correct_wcount = 0;
 	int error;
 	struct rb_node ** rb_link, * rb_parent;
-	unsigned long charged = 0;
+	unsigned long charged = 0, addr_save = addr;
 
 	if (file) {
 		if (!file->f_op || !file->f_op->mmap)
@@ -508,8 +512,17 @@ unsigned long do_mmap_pgoff(struct file
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
+	 * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB_PAGE
+	 * is unset.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+#ifdef CONFIG_HUGETLBFS
+	addr = try_hugetlb_get_unmapped_area(file, addr, len, pgoff, &flags);
+	if (IS_ERR((void *)addr))
+		return addr;
+	else if (addr == 0)
+#endif
+	addr = get_unmapped_area(file, addr_save, len, pgoff, flags);
+
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -573,6 +586,9 @@ unsigned long do_mmap_pgoff(struct file
 	default:
 		return -EINVAL;
 	case MAP_PRIVATE:
+#ifdef CONFIG_HUGETLBFS
+	case (MAP_PRIVATE|MAP_HUGETLB):
+#endif
 		vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
 		/* fall through */
 	case MAP_SHARED:
@@ -657,10 +673,31 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
-	} else if (vm_flags & VM_SHARED) {
-		error = shmem_zero_setup(vma);
-		if (error)
-			goto free_vma;
+	} else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) {
+		if (!is_vm_hugetlb_page(vma)) {
+			error = shmem_zero_setup(vma);
+			if (error)
+				goto free_vma;
+		} else {
+			/*
+			 * Presumably hugetlb_zero_setup() acquires a
+			 * reference count for us. The difference
+			 * between this and the shmem_zero_setup()
+			 * case is that we can encounter an error
+			 * _after_ allocating the file. The error
+			 * path was adjusted slightly to fput() for us.
+			 */
+			struct file *new_file = hugetlb_zero_setup(len);
+			if (IS_ERR(new_file)) {
+				error = PTR_ERR(new_file);
+				goto free_vma;
+			} else {
+				vma->vm_file = new_file;
+				error = new_file->f_op->mmap(new_file, vma);
+				if (error)
+					goto unmap_and_free_vma;
+			}
+		}
 	}
 
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
@@ -708,11 +745,21 @@ out:
 unmap_and_free_vma:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
-	vma->vm_file = NULL;
-	fput(file);
 
-	/* Undo any partial mapping done by a device driver. */
+	/*
+	 * Undo any partial mapping done by a device driver.
+	 * hugetlb wants to know the vma's file etc., so nuke
+	 * the file afterward.
+	 */
 	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+
+	/*
+	 * vma->vm_file may be different from file in the hugetlb case.
+	 */
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = NULL;
+
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
diff -aurpN -X /home/fletch/.diff.exclude 470-stacktrace/mm/shmem.c 480-implicit_huge_pages/mm/shmem.c
--- 470-stacktrace/mm/shmem.c	Fri Jan  9 17:40:10 2004
+++ 480-implicit_huge_pages/mm/shmem.c	Fri Jan  9 23:09:13 2004
@@ -40,6 +40,29 @@
 #include
 #include
 
+int shm_use_hugepages;
+
+/*
+ * On 64-bit archs the vmalloc area is very large,
+ * so we allocate the array in vmalloc on 64-bit archs.
+ *
+ * Assuming 2MB huge pages (x86 and x86-64), these default
+ * settings allow up to 128GB of huge pages in a single file
+ * on 64-bit archs, and 64GB on 32-bit archs using the max
+ * kmalloc size of 128KB. So tweaking is in practice needed
+ * only to go past 128GB of huge pages per file on 64-bit archs.
+ *
+ * This sysctl is in units of huge pages (each HPAGE_SIZE bytes).
+ */
+#ifdef CONFIG_HUGETLBFS
+#if BITS_PER_LONG == 64
+int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT);
+#else
+int shm_hugepages_per_file = 131072 / sizeof(struct page *);
+#endif
+#endif
+
+
 /* This magic number is used in glibc for posix shared memory */
 #define TMPFS_MAGIC	0x01021994
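[Not part of the patch: a userspace sketch of the implicit SysV shm
path. With shm-use-hugepages enabled, shm_with_hugepages() above
promotes any segment that is HPAGE-aligned, within the
shm-hugepages-per-file cap, and backed by enough free huge pages --
no SHM_HUGETLB flag needed from the caller. The 64MB size below
assumes a 2MB huge page; adjust for your arch.]

/* hypothetical demo -- plain shmget/shmat, nothing here is new ABI */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	size_t len = 64UL * 1024 * 1024;	/* multiple of assumed 2MB HPAGE */
	void *p;
	int id;

	id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	if (id < 0) {
		perror("shmget");
		return 1;
	}
	p = shmat(id, NULL, 0);
	if (p == (void *)-1) {
		perror("shmat");
		shmctl(id, IPC_RMID, NULL);
		return 1;
	}
	memset(p, 0, len);			/* fault in the segment */
	shmdt(p);
	shmctl(id, IPC_RMID, NULL);		/* remove when last user detaches */
	return 0;
}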