From: "Serge E. Hallyn" The vm_enough_memory functionality was replicated in three separate places, and not always kept in sync. It also used capable() for authorization checks. This caused any process which ends up checking for this permission to have PF_SUPERPRIV set (inappropriately), and caused poor dependencies between stacked modules, since each LSM was generically asked to moderate capable(CAP_SYS_ADMIN) without knowing why. Signed-off-by: Serge Hallyn Signed-off-by: Chris Wright Signed-off-by: Andrew Morton --- 25-akpm/include/linux/mm.h | 1 25-akpm/mm/mmap.c | 88 +++++++++++++++++++++++++++++++++++++++ 25-akpm/security/commoncap.c | 81 +---------------------------------- 25-akpm/security/dummy.c | 64 +--------------------------- 25-akpm/security/selinux/hooks.c | 70 ++++++------------------------- 5 files changed, 112 insertions(+), 192 deletions(-) diff -puN include/linux/mm.h~merge-_vm_enough_memorys-into-a-common-helper include/linux/mm.h --- 25/include/linux/mm.h~merge-_vm_enough_memorys-into-a-common-helper 2005-01-10 17:29:23.309343360 -0800 +++ 25-akpm/include/linux/mm.h 2005-01-10 17:29:23.320341688 -0800 @@ -711,6 +711,7 @@ static inline void vma_nonlinear_insert( } /* mmap.c */ +extern int __vm_enough_memory(long pages, int cap_sys_admin); extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); extern struct vm_area_struct *vma_merge(struct mm_struct *, diff -puN mm/mmap.c~merge-_vm_enough_memorys-into-a-common-helper mm/mmap.c --- 25/mm/mmap.c~merge-_vm_enough_memorys-into-a-common-helper 2005-01-10 17:29:23.311343056 -0800 +++ 25-akpm/mm/mmap.c 2005-01-10 17:29:23.322341384 -0800 @@ -61,10 +61,98 @@ int sysctl_overcommit_ratio = 50; /* def int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; atomic_t vm_committed_space = ATOMIC_INIT(0); +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(long pages, int cap_sys_admin) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + unsigned long n; + + free = get_page_cache_size(); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += atomic_read(&slab_reclaim_pages); + + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + free -= free / 32; + + if (free > pages) + return 0; + + /* + * nr_free_pages() is very expensive on large systems, + * only call if we're about to fail. + */ + n = nr_free_pages(); + if (!cap_sys_admin) + n -= n / 32; + free += n; + + if (free > pages) + return 0; + vm_unacct_memory(pages); + return -ENOMEM; + } + + allowed = (totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100; + /* + * Leave the last 3% for root + */ + if (!cap_sys_admin) + allowed -= allowed / 32; + allowed += total_swap_pages; + + /* Don't let a single process grow too big: + leave 3% of the size of this process for other processes */ + allowed -= current->mm->total_vm / 32; + + if (atomic_read(&vm_committed_space) < allowed) + return 0; + + vm_unacct_memory(pages); + + return -ENOMEM; +} + EXPORT_SYMBOL(sysctl_overcommit_memory); EXPORT_SYMBOL(sysctl_overcommit_ratio); EXPORT_SYMBOL(sysctl_max_map_count); EXPORT_SYMBOL(vm_committed_space); +EXPORT_SYMBOL(__vm_enough_memory); /* * Requires inode->i_mapping->i_mmap_lock diff -puN security/commoncap.c~merge-_vm_enough_memorys-into-a-common-helper security/commoncap.c --- 25/security/commoncap.c~merge-_vm_enough_memorys-into-a-common-helper 2005-01-10 17:29:23.312342904 -0800 +++ 25-akpm/security/commoncap.c 2005-01-10 17:29:23.323341232 -0800 @@ -316,86 +316,13 @@ int cap_syslog (int type) return 0; } -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - */ int cap_vm_enough_memory(long pages) { - unsigned long free, allowed; + int cap_sys_admin = 0; - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; - - free = get_page_cache_size(); - free += nr_swap_pages; - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += atomic_read(&slab_reclaim_pages); - - /* - * Leave the last 3% for root - */ - if (!capable(CAP_SYS_ADMIN)) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - if (!capable(CAP_SYS_ADMIN)) - n -= n / 32; - free += n; - - if (free > pages) - return 0; - vm_unacct_memory(pages); - return -ENOMEM; - } - - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; - /* - * Leave the last 3% for root - */ - if (!capable(CAP_SYS_ADMIN)) - allowed -= allowed / 32; - allowed += total_swap_pages; - - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; - - if (atomic_read(&vm_committed_space) < allowed) - return 0; - - vm_unacct_memory(pages); - - return -ENOMEM; + if (cap_capable(current, CAP_SYS_ADMIN) == 0) + cap_sys_admin = 1; + return __vm_enough_memory(pages, cap_sys_admin); } EXPORT_SYMBOL(cap_capable); diff -puN security/dummy.c~merge-_vm_enough_memorys-into-a-common-helper security/dummy.c --- 25/security/dummy.c~merge-_vm_enough_memorys-into-a-common-helper 2005-01-10 17:29:23.314342600 -0800 +++ 25-akpm/security/dummy.c 2005-01-10 17:29:23.324341080 -0800 @@ -108,69 +108,13 @@ static int dummy_settime(struct timespec return 0; } -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - */ static int dummy_vm_enough_memory(long pages) { - unsigned long free, allowed; + int cap_sys_admin = 0; - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = get_page_cache_size(); - free += nr_free_pages(); - free += nr_swap_pages; - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += atomic_read(&slab_reclaim_pages); - - /* - * Leave the last 3% for root - */ - if (current->euid) - free -= free / 32; - - if (free > pages) - return 0; - vm_unacct_memory(pages); - return -ENOMEM; - } - - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; - allowed += total_swap_pages; - - /* Leave the last 3% for root */ - if (current->euid) - allowed -= allowed / 32; - - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; - - if (atomic_read(&vm_committed_space) < allowed) - return 0; - - vm_unacct_memory(pages); - - return -ENOMEM; + if (dummy_capable(current, CAP_SYS_ADMIN) == 0) + cap_sys_admin = 1; + return __vm_enough_memory(pages, cap_sys_admin); } static int dummy_bprm_alloc_security (struct linux_binprm *bprm) diff -puN security/selinux/hooks.c~merge-_vm_enough_memorys-into-a-common-helper security/selinux/hooks.c --- 25/security/selinux/hooks.c~merge-_vm_enough_memorys-into-a-common-helper 2005-01-10 17:29:23.316342296 -0800 +++ 25-akpm/security/selinux/hooks.c 2005-01-10 17:29:23.327340624 -0800 @@ -1515,69 +1515,29 @@ static int selinux_syslog(int type) * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * Note that secondary_ops->capable and task_has_perm_noaudit return 0 + * if the capability is granted, but __vm_enough_memory requires 1 if + * the capability is granted. * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. + * Do not audit the selinux permission check, as this is applied to all + * processes that allocate mappings. */ static int selinux_vm_enough_memory(long pages) { - unsigned long free, allowed; - int rc; + int rc, cap_sys_admin = 0; struct task_security_struct *tsec = current->security; - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = get_page_cache_size(); - free += nr_free_pages(); - free += nr_swap_pages; - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += atomic_read(&slab_reclaim_pages); - - /* - * Leave the last 3% for privileged processes. - * Don't audit the check, as it is applied to all processes - * that allocate mappings. - */ - rc = secondary_ops->capable(current, CAP_SYS_ADMIN); - if (!rc) { - rc = avc_has_perm_noaudit(tsec->sid, tsec->sid, - SECCLASS_CAPABILITY, - CAP_TO_MASK(CAP_SYS_ADMIN), NULL); - } - if (rc) - free -= free / 32; - - if (free > pages) - return 0; - vm_unacct_memory(pages); - return -ENOMEM; - } - - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; - allowed += total_swap_pages; - - if (atomic_read(&vm_committed_space) < allowed) - return 0; + rc = secondary_ops->capable(current, CAP_SYS_ADMIN); + if (rc == 0) + rc = avc_has_perm_noaudit(tsec->sid, tsec->sid, + SECCLASS_CAPABILITY, + CAP_TO_MASK(CAP_SYS_ADMIN), + NULL); - vm_unacct_memory(pages); + if (rc == 0) + cap_sys_admin = 1; - return -ENOMEM; + return __vm_enough_memory(pages, cap_sys_admin); } /* binprm security operations */ _