From: Ingo Molnar

Rework the i386 mm layout to allow applications to allocate more virtual
memory, and larger contiguous chunks.

- The patch is compatible with existing architectures that either make use
  of HAVE_ARCH_UNMAPPED_AREA or use the default mmap() allocator - there is
  no change in behavior.

- 64-bit architectures can use the same mechanism to clean up 32-bit
  compatibility layouts: by defining HAVE_ARCH_PICK_MMAP_LAYOUT and
  providing an arch_pick_mmap_layout() function - which can then decide
  between various mmap() layout functions.

- I also introduced a new personality bit (ADDR_COMPAT_LAYOUT) to flag
  older binaries that don't have PT_GNU_STACK. x86 uses this to revert to
  the stock layout. I also changed x86 to not clear the personality bits
  upon exec(), like x86-64 already does.

- Once every architecture that uses HAVE_ARCH_UNMAPPED_AREA has defined its
  arch_pick_mmap_layout() function, we can get rid of
  HAVE_ARCH_UNMAPPED_AREA altogether, as a final cleanup.

The new layout generation function (__get_unmapped_area()) got significant
testing in FC1/2, so I'm pretty confident it's robust. Compiles & boots
fine on an 'old' and on a 'new' x86 distro as well.

The two known breakages were:

  http://www.redhatconfig.com/msg/67248.html
    [ the 'cyzload' third-party utility broke ]

  http://www.zipworld.com/au/~akpm/dde.tar.gz
    [ your editor broke :-) ]

Both were caused by application bugs that did:

	int ret = malloc();
	if (ret <= 0)
		failure;

i.e. they truncated malloc()'s pointer return value into a signed int,
which goes negative for the high addresses the top-down layout hands out.
Such bugs are easy to spot if they happen, and when they do it is possible
to work around them immediately, without having to change the binary, via
the setarch patch. No other application has been found to be affected, and
this particular change already got pretty wide coverage via RHEL3 and
exec-shield - it has been in use for more than a year.

The setarch utility can be used to trigger the compatibility layout on x86;
the following version has been patched to take the `-L' option:

  http://people.redhat.com/mingo/flexible-mmap/setarch-1.4-2.tar.gz

"setarch -L i386 <command>" will run the command with the old layout (a
rough sketch of what -L does follows below).

Signed-off-by: Andrew Morton
---
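For reference, the `-L' option boils down to setting the new personality
bit and exec()-ing the target. A minimal sketch only - my illustration,
not the actual setarch source; the real utility also handles the base
personality and uname switching:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/personality.h>

	#ifndef ADDR_COMPAT_LAYOUT
	# define ADDR_COMPAT_LAYOUT 0x0200000	/* value added by this patch */
	#endif

	int main(int argc, char **argv)
	{
		if (argc < 2) {
			fprintf(stderr, "usage: %s command [args...]\n",
				argv[0]);
			return 1;
		}
		/* query the current personality, OR in the compat bit: */
		if (personality(personality(0xffffffff) |
				ADDR_COMPAT_LAYOUT) == -1) {
			perror("personality");
			return 1;
		}
		/* the bit now survives exec() on x86: */
		execvp(argv[1], argv + 1);
		perror("execvp");
		return 1;
	}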
 25-akpm/arch/i386/mm/Makefile        |    2
 25-akpm/arch/i386/mm/mmap.c          |   69 +++++++++++++++++++++
 25-akpm/fs/binfmt_aout.c             |    2
 25-akpm/fs/binfmt_elf.c              |    2
 25-akpm/fs/exec.c                    |    1
 25-akpm/include/asm-i386/processor.h |    2
 25-akpm/include/linux/personality.h  |    1
 25-akpm/include/linux/sched.h        |   27 ++++++++
 25-akpm/kernel/fork.c                |    2
 25-akpm/mm/mmap.c                    |  111 ++++++++++++++++++++++++++++++-----
 include/asm-i386/elf.h               |    0
 11 files changed, 202 insertions(+), 17 deletions(-)

diff -puN arch/i386/mm/Makefile~flexible-mmap-2.6.7-mm3-A8 arch/i386/mm/Makefile
--- 25/arch/i386/mm/Makefile~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.057437656 -0700
+++ 25-akpm/arch/i386/mm/Makefile	2004-08-09 22:01:02.079434312 -0700
@@ -2,7 +2,7 @@
 # Makefile for the linux i386-specific parts of the memory manager.
 #
 
-obj-y	:= init.o pgtable.o fault.o ioremap.o extable.o pageattr.o
+obj-y	:= init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o
 
 obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff -puN /dev/null arch/i386/mm/mmap.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ 25-akpm/arch/i386/mm/mmap.c	2004-08-09 22:01:02.080434160 -0700
@@ -0,0 +1,69 @@
+/*
+ *  linux/arch/i386/mm/mmap.c
+ *
+ *  flexible mmap layout support
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *
+ * Started by Ingo Molnar
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave at least a ~128 MB hole.
+ */
+#define MIN_GAP	(128*1024*1024)
+#define MAX_GAP	(TASK_SIZE/6*5)
+
+static inline unsigned long mmap_base(struct mm_struct *mm)
+{
+	unsigned long gap = current->rlim[RLIMIT_STACK].rlim_cur;
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return TASK_SIZE - (gap & PAGE_MASK);
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	/*
+	 * Fall back to the standard layout if the personality
+	 * bit is set:
+	 */
+	if (current->personality & ADDR_COMPAT_LAYOUT) {
+		mm->mmap_base = TASK_UNMAPPED_BASE;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+		mm->unmap_area = arch_unmap_area;
+	} else {
+		mm->mmap_base = mmap_base(mm);
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		mm->unmap_area = arch_unmap_area_topdown;
+	}
+}
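To make the clamping in mmap_base() above concrete - the numbers assume
the usual 3 GB i386 TASK_SIZE (0xc0000000), purely as an illustration:

  - default 8 MB stack rlimit: gap is clamped up to MIN_GAP (128 MB),
    so mmap_base = 0xc0000000 - 0x08000000 = 0xb8000000

  - unlimited stack rlimit: gap is clamped down to MAX_GAP
    (TASK_SIZE/6*5 = 2.5 GB), so mmap_base = 0x20000000 (512 MB)

In both cases the mmap area starts just below the stack gap and grows
down, instead of starting at TASK_UNMAPPED_BASE (1 GB) and growing up.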
diff -puN fs/binfmt_aout.c~flexible-mmap-2.6.7-mm3-A8 fs/binfmt_aout.c
--- 25/fs/binfmt_aout.c~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.059437352 -0700
+++ 25-akpm/fs/binfmt_aout.c	2004-08-09 22:01:02.080434160 -0700
@@ -307,7 +307,7 @@ static int load_aout_binary(struct linux
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->rss = 0;
 	current->mm->mmap = NULL;
diff -puN fs/binfmt_elf.c~flexible-mmap-2.6.7-mm3-A8 fs/binfmt_elf.c
--- 25/fs/binfmt_elf.c~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.060437200 -0700
+++ 25-akpm/fs/binfmt_elf.c	2004-08-09 22:01:02.082433856 -0700
@@ -706,7 +706,7 @@ static int load_elf_binary(struct linux_
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
 	current->mm->rss = 0;
-	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->free_area_cache = current->mm->mmap_base;
 	retval = setup_arg_pages(bprm, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
diff -puN fs/exec.c~flexible-mmap-2.6.7-mm3-A8 fs/exec.c
--- 25/fs/exec.c~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.064436592 -0700
+++ 25-akpm/fs/exec.c	2004-08-09 22:01:02.083433704 -0700
@@ -546,6 +546,7 @@ static int exec_mmap(struct mm_struct *m
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
+	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		if (active_mm != old_mm) BUG();
 		mmput(old_mm);
diff -puN include/asm-i386/elf.h~flexible-mmap-2.6.7-mm3-A8 include/asm-i386/elf.h
diff -puN include/asm-i386/processor.h~flexible-mmap-2.6.7-mm3-A8 include/asm-i386/processor.h
--- 25/include/asm-i386/processor.h~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.070435680 -0700
+++ 25-akpm/include/asm-i386/processor.h	2004-08-09 22:01:02.083433704 -0700
@@ -296,6 +296,8 @@ extern unsigned int mca_pentium_flag;
  */
 #define TASK_UNMAPPED_BASE	(PAGE_ALIGN(TASK_SIZE / 3))
 
+#define HAVE_ARCH_PICK_MMAP_LAYOUT
+
 /*
  * Size of io_bitmap.
  */
diff -puN include/linux/personality.h~flexible-mmap-2.6.7-mm3-A8 include/linux/personality.h
--- 25/include/linux/personality.h~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.072435376 -0700
+++ 25-akpm/include/linux/personality.h	2004-08-09 22:01:02.084433552 -0700
@@ -30,6 +30,7 @@ extern int abi_fake_utsname;
  */
 enum {
 	MMAP_PAGE_ZERO =	0x0100000,
+	ADDR_COMPAT_LAYOUT =	0x0200000,
 	READ_IMPLIES_EXEC =	0x0400000,
 	ADDR_LIMIT_32BIT =	0x0800000,
 	SHORT_INODE =		0x1000000,
diff -puN include/linux/sched.h~flexible-mmap-2.6.7-mm3-A8 include/linux/sched.h
--- 25/include/linux/sched.h~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.073435224 -0700
+++ 25-akpm/include/linux/sched.h	2004-08-09 22:01:02.085433400 -0700
@@ -189,10 +189,26 @@ extern int sysctl_max_map_count;
 
 #include <linux/aio.h>
 
+extern unsigned long
+arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
+		       unsigned long, unsigned long);
+extern unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+			       unsigned long len, unsigned long pgoff,
+			       unsigned long flags);
+extern void arch_unmap_area(struct vm_area_struct *area);
+extern void arch_unmap_area_topdown(struct vm_area_struct *area);
+
+
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	unsigned long (*get_unmapped_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
+	void (*unmap_area) (struct vm_area_struct *area);
+	unsigned long mmap_base;		/* base of mmap area */
 	unsigned long free_area_cache;		/* first hole */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
@@ -997,6 +1013,17 @@ static inline void set_task_cpu(struct t
 
 #endif /* CONFIG_SMP */
 
+#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+extern void arch_pick_mmap_layout(struct mm_struct *mm);
+#else
+static inline void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	mm->get_unmapped_area = arch_get_unmapped_area;
+	mm->unmap_area = arch_unmap_area;
+}
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif
diff -puN kernel/fork.c~flexible-mmap-2.6.7-mm3-A8 kernel/fork.c
--- 25/kernel/fork.c~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.074435072 -0700
+++ 25-akpm/kernel/fork.c	2004-08-09 22:01:02.086433248 -0700
@@ -279,7 +279,7 @@ static inline int dup_mmap(struct mm_str
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->free_area_cache = oldmm->mmap_base;
 	mm->map_count = 0;
 	mm->rss = 0;
 	cpus_clear(mm->cpu_vm_mask);
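To illustrate the changelog's second point: a 64-bit architecture would
define HAVE_ARCH_PICK_MMAP_LAYOUT and provide something along these lines.
A sketch only, not part of this patch - the compat test and the base
calculation are placeholders an architecture would pick for itself:

	void arch_pick_mmap_layout(struct mm_struct *mm)
	{
		/*
		 * Legacy and 32-bit compat binaries keep the bottom-up
		 * layout; native binaries get the top-down layout:
		 */
		if (current->personality & ADDR_COMPAT_LAYOUT) {
			mm->mmap_base = TASK_UNMAPPED_BASE;
			mm->get_unmapped_area = arch_get_unmapped_area;
			mm->unmap_area = arch_unmap_area;
		} else {
			mm->mmap_base = mmap_base(mm); /* arch stack gap */
			mm->get_unmapped_area = arch_get_unmapped_area_topdown;
			mm->unmap_area = arch_unmap_area_topdown;
		}
	}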
diff -puN mm/mmap.c~flexible-mmap-2.6.7-mm3-A8 mm/mmap.c
--- 25/mm/mmap.c~flexible-mmap-2.6.7-mm3-A8	2004-08-09 22:01:02.076434768 -0700
+++ 25-akpm/mm/mmap.c	2004-08-09 22:01:02.088432944 -0700
@@ -1020,7 +1020,7 @@ EXPORT_SYMBOL(do_mmap_pgoff);
  * This function "knows" that -ENOMEM has the bits set.
  */
 #ifndef HAVE_ARCH_UNMAPPED_AREA
-static inline unsigned long
+unsigned long
 arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -1064,12 +1064,103 @@ full_search:
 			addr = vma->vm_end;
 	}
 }
-#else
-extern unsigned long
-arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
-			unsigned long, unsigned long);
 #endif
 
+void arch_unmap_area(struct vm_area_struct *area)
+{
+	/*
+	 * Is this a new hole at the lowest possible address?
+	 */
+	if (area->vm_start >= TASK_UNMAPPED_BASE &&
+			area->vm_start < area->vm_mm->free_area_cache)
+		area->vm_mm->free_area_cache = area->vm_start;
+}
+
+/*
+ * This mmap-allocator allocates new areas top-down from below the
+ * stack's low limit (the base):
+ */
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+			       unsigned long len, unsigned long pgoff,
+			       unsigned long flags)
+{
+	struct vm_area_struct *vma, *prev_vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long base = mm->mmap_base;
+	int first_time = 1;
+
+	/* requested length too big for entire address space */
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* don't allow allocations above current base */
+	if (mm->free_area_cache > base)
+		mm->free_area_cache = base;
+
+	/* requesting a specific address */
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+				(!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	/* make sure it can fit in the remaining address space */
+	if (mm->free_area_cache < len)
+		goto fail;
+
+	/* either no address requested or can't fit in requested address hole */
+try_again:
+	addr = (mm->free_area_cache - len) & PAGE_MASK;
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * i.e. return with success:
+		 */
+		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+			return addr;
+
+		/*
+		 * new region fits between prev_vma->vm_end and
+		 * vma->vm_start, use it:
+		 */
+		if (addr+len <= vma->vm_start &&
+				(!prev_vma || (addr >= prev_vma->vm_end)))
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		else
+			/* pull free_area_cache down to the first hole */
+			if (mm->free_area_cache == vma->vm_end)
+				mm->free_area_cache = vma->vm_start;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+	} while (len <= vma->vm_start);
+
+fail:
+	/*
+	 * if hint left us with no space for the requested
+	 * mapping then try again:
+	 */
+	if (first_time) {
+		mm->free_area_cache = base;
+		first_time = 0;
+		goto try_again;
+	}
+	return -ENOMEM;
+}
+
+void arch_unmap_area_topdown(struct vm_area_struct *area)
+{
+	/*
+	 * Is this a new hole at the highest possible address?
+	 */
+	if (area->vm_end > area->vm_mm->free_area_cache)
+		area->vm_mm->free_area_cache = area->vm_end;
+}
+
 unsigned long
 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
@@ -1104,7 +1195,7 @@ get_unmapped_area(struct file *file, uns
 		return file->f_op->get_unmapped_area(file, addr, len,
 						pgoff, flags);
 
-	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
 }
 
 EXPORT_SYMBOL(get_unmapped_area);
@@ -1394,13 +1485,7 @@ static void unmap_vma(struct mm_struct *
 	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
 	if (area->vm_flags & VM_LOCKED)
 		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
-	/*
-	 * Is this a new hole at the lowest possible address?
-	 */
-	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-	    area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
-
+	area->vm_mm->unmap_area(area);
 	remove_vm_struct(area);
 }
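A quick way to see the new layout from userspace - a standalone smoke
test, not part of the patch: successive anonymous mmap()s should now come
back at descending addresses just below the stack gap, instead of
ascending from TASK_UNMAPPED_BASE:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		int i;

		for (i = 0; i < 4; i++) {
			void *p = mmap(NULL, 1024*1024,
				       PROT_READ | PROT_WRITE,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
			printf("mapping %d at %p\n", i, p);
		}
		return 0;
	}

Run under "setarch -L i386", the same program should print ascending
addresses starting near TASK_UNMAPPED_BASE (1 GB with the standard 3 GB
split).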