diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/fs/hugetlbfs/inode.c 760-implicit_hugetlb/fs/hugetlbfs/inode.c --- 750-vsyscall_gtod_B2/fs/hugetlbfs/inode.c 2004-02-20 15:41:06.000000000 -0800 +++ 760-implicit_hugetlb/fs/hugetlbfs/inode.c 2004-02-20 15:59:41.000000000 -0800 @@ -26,12 +26,17 @@ #include #include #include +#include #include +#include /* some random number */ #define HUGETLBFS_MAGIC 0x958458f6 +extern int mmap_use_hugepages; +extern int mmap_hugepages_map_sz; + static struct super_operations hugetlbfs_ops; static struct address_space_operations hugetlbfs_aops; struct file_operations hugetlbfs_file_operations; @@ -82,7 +87,7 @@ static int hugetlbfs_file_mmap(struct fi unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #else -static unsigned long +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/include/asm-i386/mman.h 760-implicit_hugetlb/include/asm-i386/mman.h --- 750-vsyscall_gtod_B2/include/asm-i386/mman.h 2003-10-14 15:50:32.000000000 -0700 +++ 760-implicit_hugetlb/include/asm-i386/mman.h 2004-02-20 15:59:41.000000000 -0800 @@ -16,6 +16,7 @@ #define MAP_ANONYMOUS 0x20 /* don't use a file */ #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_HUGETLB 0x0400 /* Backed by hugetlb pages */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ #define MAP_LOCKED 0x2000 /* pages are locked */ diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/include/asm-ppc64/mman.h 760-implicit_hugetlb/include/asm-ppc64/mman.h --- 750-vsyscall_gtod_B2/include/asm-ppc64/mman.h 2003-10-01 11:48:24.000000000 -0700 +++ 760-implicit_hugetlb/include/asm-ppc64/mman.h 2004-02-20 15:59:41.000000000 -0800 @@ -26,6 +26,7 @@ #define MAP_LOCKED 0x80 
#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_HUGETLB 0x0400 /* Backed with hugetlb pages */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/include/linux/hugetlb.h 760-implicit_hugetlb/include/linux/hugetlb.h --- 750-vsyscall_gtod_B2/include/linux/hugetlb.h 2004-02-18 14:57:20.000000000 -0800 +++ 760-implicit_hugetlb/include/linux/hugetlb.h 2004-02-20 15:59:41.000000000 -0800 @@ -42,6 +42,9 @@ mark_mm_hugetlb(struct mm_struct *mm, st #define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0) #endif +unsigned long try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long *flags); + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) @@ -110,12 +113,21 @@ static inline void set_file_hugepages(st { file->f_op = &hugetlbfs_file_operations; } + +unsigned long +hugetlb_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() #define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) +static inline unsigned long +hugetlb_get_unmapped_area(struct file * a, unsigned long b, unsigned long c, + unsigned long d, unsigned long e) { return -ENOSYS; } #endif /* !CONFIG_HUGETLBFS */ + + #endif /* _LINUX_HUGETLB_H */ diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/include/linux/mman.h 760-implicit_hugetlb/include/linux/mman.h --- 750-vsyscall_gtod_B2/include/linux/mman.h 2003-10-14 15:50:34.000000000 -0700 +++ 760-implicit_hugetlb/include/linux/mman.h 2004-02-20 15:59:41.000000000 -0800 @@ -58,6 +58,9 @@ calc_vm_flag_bits(unsigned long flags) return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, 
MAP_EXECUTABLE, VM_EXECUTABLE) | +#ifdef CONFIG_HUGETLB_PAGE + _calc_vm_trans(flags, MAP_HUGETLB, VM_HUGETLB ) | +#endif _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); } diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/include/linux/sysctl.h 760-implicit_hugetlb/include/linux/sysctl.h --- 750-vsyscall_gtod_B2/include/linux/sysctl.h 2004-02-18 14:57:21.000000000 -0800 +++ 760-implicit_hugetlb/include/linux/sysctl.h 2004-02-20 15:59:41.000000000 -0800 @@ -129,6 +129,10 @@ enum KERN_HPPA_UNALIGNED=59, /* int: hppa unaligned-trap enable */ KERN_PRINTK_RATELIMIT=60, /* int: tune printk ratelimiting */ KERN_PRINTK_RATELIMIT_BURST=61, /* int: tune printk ratelimiting */ + KERN_SHMUSEHUGEPAGES=62, /* int: back shm with huge pages */ + KERN_MMAPUSEHUGEPAGES=63, /* int: back anon mmap with huge pages */ + KERN_HPAGES_PER_FILE=64, /* int: max bigpages per file */ + KERN_HPAGES_MAP_SZ=65, /* int: min size (MB) of mapping */ }; diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/ipc/shm.c 760-implicit_hugetlb/ipc/shm.c --- 750-vsyscall_gtod_B2/ipc/shm.c 2003-10-01 11:47:15.000000000 -0700 +++ 760-implicit_hugetlb/ipc/shm.c 2004-02-20 15:59:41.000000000 -0800 @@ -32,6 +32,9 @@ #define shm_flags shm_perm.mode +extern int shm_use_hugepages; +extern int shm_hugepages_per_file; + static struct file_operations shm_file_operations; static struct vm_operations_struct shm_vm_ops; @@ -165,6 +168,31 @@ static struct vm_operations_struct shm_v .nopage = shmem_nopage, }; +#ifdef CONFIG_HUGETLBFS +int shm_with_hugepages(int shmflag, size_t size) +{ + /* flag specified explicitly */ + if (shmflag & SHM_HUGETLB) + return 1; + /* Are we disabled? */ + if (!shm_use_hugepages) + return 0; + /* Must be HPAGE aligned */ + if (size & ~HPAGE_MASK) + return 0; + /* Are we under the max per file? */ + if ((size >> HPAGE_SHIFT) > shm_hugepages_per_file) + return 0; + /* Do we have enough free huge pages? 
*/ + if (!is_hugepage_mem_enough(size)) + return 0; + + return 1; +} +#else +int shm_with_hugepages(int shmflag, size_t size) { return 0; } +#endif + static int newseg (key_t key, int shmflg, size_t size) { int error; @@ -194,8 +222,10 @@ static int newseg (key_t key, int shmflg return error; } - if (shmflg & SHM_HUGETLB) + if (shm_with_hugepages(shmflg, size)) { + shmflg |= SHM_HUGETLB; file = hugetlb_zero_setup(size); + } else { sprintf (name, "SYSV%08x", key); file = shmem_file_setup(name, size, VM_ACCOUNT); diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/kernel/sysctl.c 760-implicit_hugetlb/kernel/sysctl.c --- 750-vsyscall_gtod_B2/kernel/sysctl.c 2004-02-18 16:23:32.000000000 -0800 +++ 760-implicit_hugetlb/kernel/sysctl.c 2004-02-20 15:59:41.000000000 -0800 @@ -63,6 +63,8 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +extern int shm_use_hugepages, shm_hugepages_per_file; +extern int mmap_use_hugepages, mmap_hugepages_map_sz; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -593,6 +595,41 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_HUGETLBFS + { + .ctl_name = KERN_SHMUSEHUGEPAGES, + .procname = "shm-use-hugepages", + .data = &shm_use_hugepages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MMAPUSEHUGEPAGES, + .procname = "mmap-use-hugepages", + .data = &mmap_use_hugepages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPAGES_PER_FILE, + .procname = "shm-hugepages-per-file", + .data = &shm_hugepages_per_file, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPAGES_MAP_SZ, + .procname = "mmap-hugepages-min-mapping", + .data = &mmap_hugepages_map_sz, + .maxlen = 
sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/mm/mmap.c 760-implicit_hugetlb/mm/mmap.c --- 750-vsyscall_gtod_B2/mm/mmap.c 2004-02-18 16:20:14.000000000 -0800 +++ 760-implicit_hugetlb/mm/mmap.c 2004-02-20 15:59:41.000000000 -0800 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,9 @@ EXPORT_SYMBOL(sysctl_overcommit_memory); EXPORT_SYMBOL(sysctl_overcommit_ratio); EXPORT_SYMBOL(vm_committed_space); +int mmap_use_hugepages = 0; +int mmap_hugepages_map_sz = 256; + /* * Requires inode->i_mapping->i_shared_sem */ @@ -463,6 +467,46 @@ static int vma_merge(struct mm_struct *m return 0; } +#ifdef CONFIG_HUGETLBFS +int mmap_hugetlb_implicit(unsigned long len) +{ + /* Are we enabled? */ + if (!mmap_use_hugepages) + return 0; + /* Must be HPAGE aligned */ + if (len & ~HPAGE_MASK) + return 0; + /* Are we under the minimum size? */ + if (mmap_hugepages_map_sz + && len < (mmap_hugepages_map_sz << 20)) + return 0; + + return 1; +} +#else +int mmap_hugetlb_implicit(unsigned long len) { return 0; } +#endif + +unsigned long +try_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long *flags) +{ + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + + if (*flags & MAP_HUGETLB) { + return hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags); + } + + if (mmap_hugetlb_implicit(len)) { + addr = hugetlb_get_unmapped_area(NULL, addr, len, pgoff, *flags); + if (!(addr & ~HPAGE_MASK)) + *flags |= MAP_HUGETLB; + return addr; + } + return -ENOMEM; +} + /* * The caller must hold down_write(current->mm->mmap_sem).
*/ @@ -478,7 +522,8 @@ unsigned long do_mmap_pgoff(struct file int correct_wcount = 0; int error; struct rb_node ** rb_link, * rb_parent; - unsigned long charged = 0; + unsigned long charged = 0, addr_save = addr; + int hugetlb_explicit = (flags & MAP_HUGETLB) != 0; if (file) { if (!file->f_op || !file->f_op->mmap) @@ -506,8 +551,14 @@ unsigned long do_mmap_pgoff(struct file /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. + * VM_HUGETLB will never appear in vm_flags when CONFIG_HUGETLB is + * unset. */ - addr = get_unmapped_area(file, addr, len, pgoff, flags); + addr = try_hugetlb_get_unmapped_area(NULL, addr, len, pgoff, &flags); + if (!(flags & MAP_HUGETLB)) +hugetlb_fallback: + addr = get_unmapped_area(file, addr_save, len, pgoff, flags); + if (addr & ~PAGE_MASK) return addr; @@ -655,10 +706,44 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; + } else if ((vm_flags & VM_SHARED) || (vm_flags & VM_HUGETLB)) { + if (!is_vm_hugetlb_page(vma)) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + /* + * Presumably hugetlb_zero_setup() acquires a + * reference count for us. The difference + * between this and the shmem_zero_setup() + * case is that we can encounter an error + * _after_ allocating the file. The error + * path was adjusted slightly to fput() for us. + */ + struct file *new_file = hugetlb_zero_setup(len); + if (IS_ERR(new_file)) { + if (hugetlb_explicit) { + error = PTR_ERR(new_file); + goto free_vma; + } else { + /* + * We tried an implicit hugetlb mmap + * but we failed to get the pages. + * We basically have to start over. 
+ */ + flags &= ~MAP_HUGETLB; + kmem_cache_free(vm_area_cachep, vma); + if (charged) + vm_unacct_memory(charged); + goto hugetlb_fallback; + } + } else { + vma->vm_file = new_file; + error = new_file->f_op->mmap(new_file, vma); + if (error) + goto unmap_and_free_vma; + } + } } /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform @@ -706,11 +791,21 @@ out: unmap_and_free_vma: if (correct_wcount) atomic_inc(&inode->i_writecount); - vma->vm_file = NULL; - fput(file); - /* Undo any partial mapping done by a device driver. */ + /* + * Undo any partial mapping done by a device driver. + * hugetlb wants to know the vma's file etc. so nuke + * the file afterward. + */ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); + + /* + * vma->vm_file may be different from file in the hugetlb case. + */ + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = NULL; + free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: diff -purN -X /home/mbligh/.diff.exclude 750-vsyscall_gtod_B2/mm/shmem.c 760-implicit_hugetlb/mm/shmem.c --- 750-vsyscall_gtod_B2/mm/shmem.c 2004-02-04 16:24:35.000000000 -0800 +++ 760-implicit_hugetlb/mm/shmem.c 2004-02-20 15:59:41.000000000 -0800 @@ -40,6 +40,29 @@ #include #include +int shm_use_hugepages; + +/* + * On 64bit archs the vmalloc area is very large, + * so we allocate the array in vmalloc on 64bit archs. + * + * Assuming 2M pages (x86 and x86-64) those default setting + * will allow up to 128G of bigpages in a single file on + * 64bit archs and 64G on 32bit archs using the max + * kmalloc size of 128k. So tweaking in practice is needed + * only to go past 128G of bigpages per file on 64bit archs. + * + * This sysctl is in page units (each page large BIGPAGE_SIZE). 
+ */ +#ifdef CONFIG_HUGETLBFS +#if BITS_PER_LONG == 64 +int shm_hugepages_per_file = 128UL << (30 - HPAGE_SHIFT); +#else +int shm_hugepages_per_file = 131072 / sizeof(struct page *); +#endif +#endif + + /* This magic number is used in glibc for posix shared memory */ #define TMPFS_MAGIC 0x01021994