# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet v2.5.74 -> 1.1384 # kernel/ksyms.c 1.206 -> 1.207 # arch/x86_64/ia32/ia32_binfmt.c 1.14 -> 1.15 # arch/alpha/mm/numa.c 1.12 -> 1.13 # include/linux/mm.h 1.121 -> 1.123 # arch/i386/kernel/cpu/common.c 1.21 -> 1.22 # fs/open.c 1.42 -> 1.43 # include/asm-mips64/mmzone.h 1.7 -> 1.8 # mm/page_alloc.c 1.163 -> 1.166 # arch/arm/mm/init.c 1.22 -> 1.23 # kernel/fork.c 1.127 -> 1.128 # drivers/block/ll_rw_blk.c 1.174 -> 1.175 # include/linux/mman.h 1.3 -> 1.4 # fs/attr.c 1.17 -> 1.18 # fs/proc/proc_misc.c 1.80 -> 1.81 # arch/i386/Kconfig 1.64 -> 1.65 # arch/x86_64/mm/init.c 1.19 -> 1.20 # mm/swapfile.c 1.79 -> 1.80 # security/dummy.c 1.26 -> 1.27 # include/asm-i386/timer.h 1.7 -> 1.9 # fs/jbd/transaction.c 1.68 -> 1.69 # arch/i386/kernel/io_apic.c 1.74 -> 1.75 # fs/ramfs/inode.c 1.34 -> 1.35 # security/capability.c 1.18 -> 1.19 # fs/exec.c 1.85 -> 1.87 # mm/swap.c 1.51 -> 1.52 # fs/coda/file.c 1.10 -> 1.11 # mm/mprotect.c 1.22 -> 1.23 # mm/shmem.c 1.127 -> 1.128 # arch/ia64/mm/init.c 1.44 -> 1.45 # arch/i386/mm/pageattr.c 1.4 -> 1.5 # drivers/net/e100/e100_main.c 1.78 -> 1.79 # arch/i386/kernel/time.c 1.37 -> 1.38 # include/asm-i386/mmzone.h 1.12 -> 1.13 # fs/ext2/ialloc.c 1.34 -> 1.35 # include/asm-i386/cacheflush.h 1.3 -> 1.4 # mm/mremap.c 1.29 -> 1.31 # include/linux/mmzone.h 1.39 -> 1.40 # arch/ia64/kernel/sys_ia64.c 1.23 -> 1.24 # arch/ppc64/mm/init.c 1.46 -> 1.47 # mm/mmap.c 1.87 -> 1.88 # fs/nfs/file.c 1.28 -> 1.29 # arch/i386/kernel/timers/timer_tsc.c 1.19 -> 1.21 # arch/ppc64/mm/numa.c 1.7 -> 1.8 # arch/i386/kernel/timers/timer_cyclone.c 1.8 -> 1.9 # include/linux/security.h 1.24 -> 1.25 # fs/proc/root.c 1.13 -> 1.14 # mm/slab.c 1.90 -> 1.92 # init/Kconfig 1.16 -> 1.17 # arch/x86_64/mm/numa.c 1.3 -> 1.4 # include/asm-x86_64/mmzone.h 1.3 -> 1.4 # drivers/net/Kconfig 1.34 -> 1.35 # mm/nommu.c 1.3 -> 1.4 # kernel/exit.c 1.104 -> 1.105 # arch/s390/kernel/compat_exec.c 1.2 -> 1.3 # fs/jbd/commit.c 1.36 -> 1.37 # Documentation/filesystems/Locking 1.42 -> 1.43 # arch/i386/lib/delay.c 1.4 -> 1.5 # include/asm-alpha/mmzone.h 1.8 -> 1.9 # arch/mips/kernel/sysirix.c 1.11 -> 1.12 # arch/ia64/ia32/binfmt_elf32.c 1.13 -> 1.14 # arch/arm26/mm/init.c 1.1 -> 1.2 # include/asm-ppc64/mmzone.h 1.11 -> 1.12 # arch/i386/kernel/timers/timer.c 1.8 -> 1.10 # drivers/block/cciss.c 1.82 -> 1.83 # include/linux/slab.h 1.25 -> 1.27 # fs/block_dev.c 1.133 -> 1.134 # arch/i386/mm/pgtable.c 1.12 -> 1.13 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/07/02 torvalds@home.osdl.org 1.1361 # Linux 2.5.74 # -------------------------------------------- # 03/07/02 ilmari@ilmari.org 1.1362 # [PATCH] Allow modular DM # # With the recent fixes, io_schedule needs to be exported for modular dm # to work. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1363 # [PATCH] move_vma() make_pages_present() fix # # From: Hugh Dickins # # mremap's move_vma VM_LOCKED case was still wrong. # # If the do_munmap unmaps a part of new_vma, then its vm_start and vm_end # from before cannot both be the right addresses for the make_pages_present # range, and may BUG() there. 
# # We need [new_addr, new_addr+new_len) to be locked down; but # move_page_tables already transferred the locked pages [new_addr, # new_addr+old_len), and they're either held in a VM_LOCKED vma throughout, # or temporarily in no vma: in neither case can they be swapped out, so no need # to run over that range again. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1364 # [PATCH] page unmapping debug # # From: Manfred Spraul # # Manfred's latest page unmapping debug patch. # # The patch adds support for a special debug mode to both the page and the slab # allocator: Unused pages are removed from the kernel linear mapping. This # means that now any access to freed memory will cause an immediate exception. # Right now, read accesses remain totally unnoticed and write accesses may be # caught by the slab poisoning, but usually far too late for a meaningful bug # report. # # The implementation is based on a new arch-dependent function, # kernel_map_pages(), that removes the pages from the linear mapping. It's # right now only implemented for i386; a sketch of the interface appears below, # after the 1.1366 entry. # # Changelog: # # - Add kernel_map_pages() for i386, based on change_page_attr. If # DEBUG_PAGEALLOC is not set, then the function is an empty stub. The stub # is in <linux/mm.h>, i.e. it exists for all archs. # # - Make change_page_attr irq safe. Note that it's not fully irq safe due to # the lack of the tlb flush ipi, but it's good enough for kernel_map_pages(). # Another problem is that kernel_map_pages is not permitted to fail, thus # PSE is disabled if DEBUG_PAGEALLOC is enabled. # # - use kernel_map_pages() for the page allocator. # # - use kernel_map_pages() for the slab allocator. # # I couldn't resist and added additional debugging support into mm/slab.c: # # * at kfree time, the complete backtrace of the kfree caller is stored # in the freed object. # # * a ptrinfo() function that dumps all known data about a kernel virtual # address: the pte value, if it belongs to a slab cache the cache name and # additional info. # # * merging of common code: new helper functions obj_dbglen and obj_dbghdr # for the conversion between the user-visible object pointers/lengths and the # actual, internal addresses and length values. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1365 # [PATCH] NUMA memory reporting fix # # From: Dave Hansen # # The current numa meminfo code exports (via sysfs) pgdat->node_size as # totalram. This variable is consistently used elsewhere to mean "the number # of physical pages that this particular node spans". This is _not_ what we # want to see from meminfo, which is: "how much actual memory does this node # have?" # # The following patch removes pgdat->node_size, and replaces it with # ->node_spanned_pages. This is to avoid confusion with a new variable, # node_present_pages, which is the _actual_ value that we want to export in # meminfo. Most of the patch is a simple s/node_size/node_spanned_pages/. # The node_size() macro is also removed, and replaced with new ones for # node_{spanned,present}_pages() to avoid confusion. # # We were bitten by this problem in this bug: # http://bugme.osdl.org/show_bug.cgi?id=818 # # Compiled and tested on NUMA-Q. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1366 # [PATCH] ramfs: use generic_file_llseek # # Teach ramfs to use generic_file_llseek: default_llseek takes lock_kernel().
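# To tie the 1.1364 entry above to code: the following is a minimal,
# illustrative sketch of the kernel_map_pages() interface shape, not the
# patch text itself.  With DEBUG_PAGEALLOC set the architecture supplies the
# real function; otherwise an empty inline stub lets the page and slab
# allocators call it unconditionally on every arch:
#
#     #ifdef CONFIG_DEBUG_PAGEALLOC
#     /* arch-specific: restore (enable=1) or remove (enable=0) the pages'
#      * kernel linear mapping, so touching freed memory faults at once */
#     void kernel_map_pages(struct page *page, int numpages, int enable);
#     #else
#     static inline void kernel_map_pages(struct page *page, int numpages,
#                                         int enable)
#     {
#     }
#     #endif
#
# The page-freeing path then unmaps with kernel_map_pages(page, 1 << order, 0),
# and the allocation path restores the mapping before handing pages out, as in
# the mm/page_alloc.c hunks further down.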
# -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1367 # [PATCH] inode_change_ok(): remove lock_kernel() # # `attr' is on the stack, and the inode's contents can change as soon as we # return from inode_change_ok() anyway. I can't see anything which is actually # being locked in there. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1368 # [PATCH] nommu vmtruncate: remove lock_kernel() # # lock_kernel() need not be held across truncate. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1369 # [PATCH] procfs: remove some unneeded lock_kernel()s # # From: William Lee Irwin III # # Remove spurious BKL acquisitions in /proc/. The BKL is not required to # access nr_threads for reporting, and get_locks_status() takes it # internally, wrapping all operations with it. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1370 # [PATCH] remove lock_kernel() from file_ops.flush() # # Rework the file_ops.flush() API so that it is no longer called under # lock_kernel(). Push lock_kernel() down to all implementations except CIFS, # which doesn't want it. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1371 # [PATCH] block_llseek(): remove lock_kernel() # # Replace it with the blockdev inode's i_sem. And we only really need that for # atomic access to file->f_pos. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1372 # [PATCH] Make CONFIG_TC35815 depend on CONFIG_TOSHIBA_JMR3927 # # From: Adrian Bunk # # I got an error at the final linking with CONFIG_TC35815 enabled since # the variables tc_readl and tc_writel are not available. # # The only place where they are defined is arch/mips/pci/ops-jmr3927.c. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1373 # [PATCH] Report detached thread exit to the debugger # # From: Daniel Jacobowitz # # Right now, CLONE_DETACHED threads silently vanish from GDB's sight when # they exit. This patch lets the thread report its exit to the debugger, and # then be auto-reaped as soon as it is collected, instead of being reaped as # soon as it exits and not reported at all. # # GDB works either way, but this is more correct and will be useful for some # later GDB patches. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1374 # [PATCH] timer renaming and cleanups # # From: john stultz # # This renames the bad "timer" variable to "cur_timer" and moves externs to # .h files. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1375 # [PATCH] fix lost_tick detector for speedstep # # From: john stultz # # The patch tries to resolve issues caused by running the TSC-based lost # tick compensation code on CPUs that change frequency (speedstep, etc). # # Should the CPU be in slow mode when calibrate_tsc() executes, the kernel # will assume we have so many cycles per tick. Later when the cpu speeds up, # the kernel will start noting that too many cycles have passed since the last # interrupt. Since this can occasionally happen, the lost tick compensation # code then tries to fix this by incrementing jiffies. Thus every tick we # end up incrementing jiffies many times, causing timers to expire too # quickly and time to rush ahead. # # This patch detects when there have been 100 consecutive interrupts where we # had to compensate for lost ticks. If this occurs, we spit out a warning # and fall back to using the PIT as a time source.
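# In code terms, the compensation and the new fallback amount to the
# following sketch (illustrative pseudo-C; the real change is the
# timer_tsc.c hunk below, which also gates the tick-rollover fixup on
# 'lost'):
#
#     delta += delay_at_last_interrupt;
#     lost = delta / (1000000 / HZ);   /* whole ticks since the last one */
#     if (lost >= 2) {
#             jiffies += lost - 1;     /* compensate for the missed ticks */
#             /* compensating on over 100 consecutive interrupts means the
#              * TSC calibration is off (e.g. speedstep): stop trusting it */
#             if (lost_count++ > 100)
#                     clock_fallback();  /* switch cur_timer to the PIT */
#     } else
#             lost_count = 0;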
# # I've tested this on my speedstep-enabled laptop with success, and other # laptop users seeing this problem have reported it works for them. Also to # ensure we don't fall back to the slower PIT too quickly, I tested the code # on a system I have that loses ~30 ticks about every second and it can # still manage to use the TSC as a good time source. # # This solves most of the "time doubling" problems seen on laptops. # Additionally this revision has been modified to use the cleanups made in # rename-timer_A1. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1376 # [PATCH] fix lost-tick compensation corner-case # # From: john stultz # # This patch catches a corner case in the lost-tick compensation code. # # There is a check to see if we overflowed between reads of the two time # sources; however, should the high-res time source be slightly slower than # what we calibrated, it's possible to trigger this code when no ticks have # been lost. # # This patch adds an extra check to ensure we have seen more than one tick # before we check for this overflow. This seems to resolve the remaining # "time doubling" issues that I've seen reported. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1377 # [PATCH] cleanup and generalise lowmem_page_address # # From: William Lee Irwin III # # This patch allows architectures to micro-optimize lowmem_page_address() at # their whims. Roman Zippel originally wrote and/or suggested this back when # dependencies on page->virtual existing were being shaken out. That's # long-settled, so it's fine to do this now. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1378 # [PATCH] Security hook for vm_enough_memory # # From: Stephen Smalley # # This patch against 2.5.73 replaces vm_enough_memory with a security hook # per Alan Cox's suggestion so that security modules can completely replace # the logic if desired. # # Note that the patch changes the interface to follow the convention of the # other security hooks, i.e. return 0 if ok or -errno on failure (-ENOMEM in # this case) rather than returning a boolean (a before-and-after call-site # sketch appears just before the diffs, below). It also exports various # variables and functions required for the vm_enough_memory logic. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1379 # [PATCH] ext2: inode allocation race fix # # ext2's inode allocator will call find_group_orlov(), which will return a # suitable blockgroup in which the inode should be allocated. But by the time # we actually try to allocate an inode in the blockgroup, other CPUs could have # used them all up. # # ext2 will bogusly fail with "ext2_new_inode: Free inodes count corrupted in # group NN". # # # To fix this we just advance onto the next blockgroup if the rare race # happens. If we've scanned all blockgroups then return -ENOSPC. # # # (This is a bit inaccurate: after we've scanned all blockgroups, there may # still be available inodes due to inode freeing activity in other blockgroups. # This cannot be fixed without fs-wide locking. The effect is a slightly # early ENOSPC in a nearly-full filesystem.) # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1380 # [PATCH] fix double mmdrop() on exec path # # If load_elf_binary() (and the other binary handlers) fail after # flush_old_exec() (for example, in setup_arg_pages()) then do_execve() will go # through and do mmdrop(bprm.mm). # # But bprm.mm is now current->mm. We've just freed the current process's mm. # The kernel dies in a most ghastly manner.
# # Fix that up by nulling out bprm.mm in flush_old_exec(), at the point where we # consumed the mm. Handle the null pointer in the do_execve() error path. # # Also: don't open-code free_arg_pages() in do_execve(): call it instead. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1381 # [PATCH] ext3: fix journal_release_buffer() race # # CPU0: journal_get_write_access(bh) # (adds the buffer to t_reserved_list) # # CPU1: journal_get_write_access(bh) # (it's already on t_reserved_list: # nothing to do) # # CPU0: decides it doesn't want to journal the buffer after all, and calls # journal_release_buffer() # (the buffer gets pulled off the transaction) # # CPU1: journal_dirty_metadata() # (the buffer isn't on the reserved # list! The kernel explodes) # # # Simple fix: just leave the buffer on t_reserved_list in # journal_release_buffer(). If nobody ends up claiming the buffer then it will # get thrown away at start of transaction commit. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1382 # [PATCH] Set limits on CONFIG_LOG_BUF_SHIFT # # From: bert hubert # # Attached patch adds a range check to LOG_BUF_SHIFT and clarifies the # configuration somewhat. I managed to build a non-booting kernel because I # thought 64 was a nice power of two, which led to the kernel blocking when # it tried to actually use or allocate a 2^64-byte buffer. # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1383 # [PATCH] Fix cciss hang # # From: Jens Axboe # # It fixes a hang when performing large I/O's. Has been tested and acked by # the maintainer, "Wiran, Francis". # -------------------------------------------- # 03/07/02 akpm@osdl.org 1.1384 # [PATCH] e100 use-after-free fix # # I thought Scott had recently merged this but it seems not. We'll be # needing this patch if you merge Manfred's page unmapping debug patch.
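# One recurring pattern worth spelling out before the diffs: the 1.1378
# conversion inverts the return convention at every call site.  The old
# vm_enough_memory() returned 1 when the mapping was allowed, while
# security_vm_enough_memory() follows the usual security-hook style of
# 0 on success and -errno (-ENOMEM here) on failure.  A composite sketch
# of a typical caller, with names as in the hunks below:
#
#     /* before: boolean, nonzero means "allowed" */
#     if (!vm_enough_memory(charged))
#             return -ENOMEM;
#
#     /* after: 0 means "allowed", so the test loses its negation */
#     if (security_vm_enough_memory(charged))
#             return -ENOMEM;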
# -------------------------------------------- # diff -Nru a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking --- a/Documentation/filesystems/Locking Thu Jul 3 01:12:43 2003 +++ b/Documentation/filesystems/Locking Thu Jul 3 01:12:43 2003 @@ -318,7 +318,7 @@ ioctl: yes (see below) mmap: no open: maybe (see below) -flush: yes +flush: no release: no fsync: yes (see below) fasync: yes (see below) diff -Nru a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c --- a/arch/alpha/mm/numa.c Thu Jul 3 01:12:43 2003 +++ b/arch/alpha/mm/numa.c Thu Jul 3 01:12:43 2003 @@ -338,7 +338,7 @@ lmem_map = node_mem_map(nid); pfn = NODE_DATA(nid)->node_start_pfn; - for (i = 0; i < node_size(nid); i++, pfn++) + for (i = 0; i < node_spanned_pages(nid); i++, pfn++) if (page_is_ram(pfn) && PageReserved(lmem_map+i)) reservedpages++; } @@ -372,7 +372,7 @@ printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for (nid = 0; nid < numnodes; nid++) { struct page * lmem_map = node_mem_map(nid); - i = node_size(nid); + i = node_spanned_pages(nid); while (i-- > 0) { total++; if (PageReserved(lmem_map+i)) diff -Nru a/arch/arm/mm/init.c b/arch/arm/mm/init.c --- a/arch/arm/mm/init.c Thu Jul 3 01:12:43 2003 +++ b/arch/arm/mm/init.c Thu Jul 3 01:12:43 2003 @@ -79,7 +79,7 @@ struct page *page, *end; page = NODE_MEM_MAP(node); - end = page + NODE_DATA(node)->node_size; + end = page + NODE_DATA(node)->node_spanned_pages; do { total++; @@ -576,7 +576,7 @@ for (node = 0; node < numnodes; node++) { pg_data_t *pgdat = NODE_DATA(node); - if (pgdat->node_size != 0) + if (pgdat->node_spanned_pages != 0) totalram_pages += free_all_bootmem_node(pgdat); } diff -Nru a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c --- a/arch/arm26/mm/init.c Thu Jul 3 01:12:43 2003 +++ b/arch/arm26/mm/init.c Thu Jul 3 01:12:43 2003 @@ -68,7 +68,7 @@ page = NODE_MEM_MAP(0); - end = page + NODE_DATA(0)->node_size; + end = page + NODE_DATA(0)->node_spanned_pages; do { total++; @@ -353,7 +353,7 @@ max_mapnr = virt_to_page(high_memory) - mem_map; /* this will put all unused low memory onto the freelists */ - if (pgdat->node_size != 0) + if (pgdat->node_spanned_pages != 0) totalram_pages += free_all_bootmem_node(pgdat); printk(KERN_INFO "Memory:"); diff -Nru a/arch/i386/Kconfig b/arch/i386/Kconfig --- a/arch/i386/Kconfig Thu Jul 3 01:12:43 2003 +++ b/arch/i386/Kconfig Thu Jul 3 01:12:43 2003 @@ -1339,6 +1339,14 @@ best used in conjunction with the NMI watchdog so that spinlock deadlocks are also debuggable. +config DEBUG_PAGEALLOC + bool "Page alloc debugging" + depends on DEBUG_KERNEL + help + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruptions. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM diff -Nru a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c --- a/arch/i386/kernel/cpu/common.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/kernel/cpu/common.c Thu Jul 3 01:12:43 2003 @@ -430,6 +430,14 @@ rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* pse is not compatible with on-the-fly unmapping, + * disable it even if the cpus claim to support it. + */ + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; +#endif } /* * cpu_init() initializes state that is per-CPU. 
Some data is already diff -Nru a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c --- a/arch/i386/kernel/io_apic.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/kernel/io_apic.c Thu Jul 3 01:12:43 2003 @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -2052,7 +2053,6 @@ */ static inline void check_timer(void) { - extern int timer_ack; int pin1, pin2; int vector; diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c --- a/arch/i386/kernel/time.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/kernel/time.c Thu Jul 3 01:12:43 2003 @@ -80,8 +80,7 @@ spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(i8253_lock); -extern struct timer_opts timer_none; -struct timer_opts* timer = &timer_none; +struct timer_opts *cur_timer = &timer_none; /* * This version of gettimeofday has microsecond resolution @@ -93,14 +92,14 @@ unsigned long usec, sec; do { + unsigned long lost; + seq = read_seqbegin(&xtime_lock); - usec = timer->get_offset(); - { - unsigned long lost = jiffies - wall_jiffies; - if (lost) - usec += lost * (1000000 / HZ); - } + usec = cur_timer->get_offset(); + lost = jiffies - wall_jiffies; + if (lost) + usec += lost * (1000000 / HZ); sec = xtime.tv_sec; usec += (xtime.tv_nsec / 1000); } while (read_seqretry(&xtime_lock, seq)); @@ -126,7 +125,7 @@ * wall time. Discover what correction gettimeofday() would have * made, and then undo it! */ - tv->tv_nsec -= timer->get_offset() * NSEC_PER_USEC; + tv->tv_nsec -= cur_timer->get_offset() * NSEC_PER_USEC; tv->tv_nsec -= (jiffies - wall_jiffies) * TICK_NSEC; while (tv->tv_nsec < 0) { @@ -180,7 +179,7 @@ */ unsigned long long monotonic_clock(void) { - return timer->monotonic_clock(); + return cur_timer->monotonic_clock(); } EXPORT_SYMBOL(monotonic_clock); @@ -189,7 +188,8 @@ * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ -static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) +static inline void do_timer_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { @@ -259,7 +259,7 @@ */ write_seqlock(&xtime_lock); - timer->mark_offset(); + cur_timer->mark_offset(); do_timer_interrupt(irq, NULL, regs); @@ -301,16 +301,13 @@ device_initcall(time_init_device); - void __init time_init(void) { - xtime.tv_sec = get_cmos_time(); wall_to_monotonic.tv_sec = -xtime.tv_sec; xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); wall_to_monotonic.tv_nsec = -xtime.tv_nsec; - - timer = select_timer(); + cur_timer = select_timer(); time_init_hook(); } diff -Nru a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c --- a/arch/i386/kernel/timers/timer.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/kernel/timers/timer.c Thu Jul 3 01:12:43 2003 @@ -3,12 +3,6 @@ #include #include -/* list of externed timers */ -extern struct timer_opts timer_pit; -extern struct timer_opts timer_tsc; -#ifdef CONFIG_X86_CYCLONE_TIMER -extern struct timer_opts timer_cyclone; -#endif /* list of timers, ordered by preference, NULL terminated */ static struct timer_opts* timers[] = { #ifdef CONFIG_X86_CYCLONE_TIMER @@ -28,6 +22,15 @@ return 1; } __setup("clock=", clock_setup); + + +/* The chosen timesource has been found to be bad. + * Fall back to a known good timesource (the PIT) + */ +void clock_fallback(void) +{ + cur_timer = &timer_pit; +} /* iterates through the list of timers, returning the first * one that initializes successfully. 
diff -Nru a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c --- a/arch/i386/kernel/timers/timer_cyclone.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/kernel/timers/timer_cyclone.c Thu Jul 3 01:12:43 2003 @@ -88,7 +88,7 @@ * between cyclone and pit reads (as noted when * usec delta is > 90% # of usecs/tick) */ - if (abs(delay - delay_at_last_interrupt) > (900000/HZ)) + if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) jiffies++; } diff -Nru a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c --- a/arch/i386/kernel/timers/timer_tsc.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/kernel/timers/timer_tsc.c Thu Jul 3 01:12:43 2003 @@ -124,6 +124,7 @@ int countmp; static int count1 = 0; unsigned long long this_offset, last_offset; + static int lost_count = 0; write_lock(&monotonic_lock); last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; @@ -178,9 +179,19 @@ delta += delay_at_last_interrupt; lost = delta/(1000000/HZ); delay = delta%(1000000/HZ); - if (lost >= 2) + if (lost >= 2) { jiffies += lost-1; + /* sanity check to ensure we're not always losing ticks */ + if (lost_count++ > 100) { + printk(KERN_WARNING "Losing too many ticks!\n"); + printk(KERN_WARNING "TSC cannot be used as a timesource." + " (Are you running with SpeedStep?)\n"); + printk(KERN_WARNING "Falling back to a sane timesource.\n"); + clock_fallback(); + } + } else + lost_count = 0; /* update the monotonic base value */ this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; monotonic_base += cycles_2_ns(this_offset - last_offset); @@ -194,7 +205,7 @@ * between tsc and pit reads (as noted when * usec delta is > 90% # of usecs/tick) */ - if (abs(delay - delay_at_last_interrupt) > (900000/HZ)) + if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) jiffies++; } diff -Nru a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c --- a/arch/i386/lib/delay.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/lib/delay.c Thu Jul 3 01:12:43 2003 @@ -25,7 +25,7 @@ void __delay(unsigned long loops) { - timer->delay(loops); + cur_timer->delay(loops); } inline void __const_udelay(unsigned long xloops) diff -Nru a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c --- a/arch/i386/mm/pageattr.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/mm/pageattr.c Thu Jul 3 01:12:43 2003 @@ -13,6 +13,10 @@ #include #include +static spinlock_t cpa_lock = SPIN_LOCK_UNLOCKED; +static struct list_head df_list = LIST_HEAD_INIT(df_list); + + static inline pte_t *lookup_address(unsigned long address) { pgd_t *pgd = pgd_offset_k(address); @@ -31,10 +35,15 @@ { int i; unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); + struct page *base; pte_t *pbase; + + spin_unlock_irq(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + spin_lock_irq(&cpa_lock); if (!base) return NULL; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -87,7 +96,7 @@ } static int -__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage) +__change_page_attr(struct page *page, pgprot_t prot) { pte_t *kpte; unsigned long address; @@ -123,7 +132,7 @@ } if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { - *oldpage = kpte_page; + list_add(&kpte_page->list, &df_list); revert_page(kpte_page, address); } return 0; @@ -134,12 +143,6 @@ on_each_cpu(flush_kernel_map, NULL, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ -
/* * Change the page attributes of an page in the linear mapping. * @@ -156,47 +159,54 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) { int err = 0; - struct page *fpage; int i; + unsigned long flags; - down_write(&init_mm.mmap_sem); + spin_lock_irqsave(&cpa_lock, flags); for (i = 0; i < numpages; i++, page++) { - fpage = NULL; - err = __change_page_attr(page, prot, &fpage); + err = __change_page_attr(page, prot); if (err) break; - if (fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df_list = df; - } - } } - up_write(&init_mm.mmap_sem); + spin_unlock_irqrestore(&cpa_lock, flags); return err; } void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + LIST_HEAD(l); + struct list_head* n; - down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); - up_read(&init_mm.mmap_sem); + BUG_ON(irqs_disabled()); + + spin_lock_irq(&cpa_lock); + list_splice_init(&df_list, &l); + spin_unlock_irq(&cpa_lock); flush_map(); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); - } + n = l.next; + while (n != &l) { + struct page *pg = list_entry(n, struct page, list); + n = n->next; + __free_page(pg); + } } + +#ifdef CONFIG_DEBUG_PAGEALLOC +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + /* the return value is ignored - the calls cannot fail, + * large pages are disabled at boot time. + */ + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + /* we should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu. + */ + __flush_tlb_all(); +} +EXPORT_SYMBOL(kernel_map_pages); +#endif EXPORT_SYMBOL(change_page_attr); EXPORT_SYMBOL(global_flush_tlb); diff -Nru a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c --- a/arch/i386/mm/pgtable.c Thu Jul 3 01:12:43 2003 +++ b/arch/i386/mm/pgtable.c Thu Jul 3 01:12:43 2003 @@ -34,7 +34,7 @@ show_free_areas(); printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_size; ++i) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat->node_mem_map + i; total++; if (PageHighMem(page)) diff -Nru a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c --- a/arch/ia64/ia32/binfmt_elf32.c Thu Jul 3 01:12:43 2003 +++ b/arch/ia64/ia32/binfmt_elf32.c Thu Jul 3 01:12:43 2003 @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -177,7 +178,7 @@ if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff -Nru a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c --- a/arch/ia64/kernel/sys_ia64.c Thu Jul 3 01:12:43 2003 +++ b/arch/ia64/kernel/sys_ia64.c Thu Jul 3 01:12:43 2003 @@ -100,7 +100,6 @@ asmlinkage unsigned long ia64_brk (unsigned long brk) { - extern int vm_enough_memory (long pages); unsigned long rlim, retval, newbrk, oldbrk; struct mm_struct *mm = current->mm; diff -Nru a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c --- a/arch/ia64/mm/init.c Thu Jul 3 01:12:43 2003 +++ b/arch/ia64/mm/init.c Thu Jul 3 01:12:43 2003 @@ -232,7 +232,7 @@ printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); 
for_each_pgdat(pgdat) { printk("Node ID: %d\n", pgdat->node_id); - for(i = 0; i < pgdat->node_size; i++) { + for(i = 0; i < pgdat->node_spanned_pages; i++) { if (PageReserved(pgdat->node_mem_map+i)) reserved++; else if (PageSwapCache(pgdat->node_mem_map+i)) @@ -240,7 +240,7 @@ else if (page_count(pgdat->node_mem_map + i)) shared += page_count(pgdat->node_mem_map + i) - 1; } - printk("\t%d pages of RAM\n", pgdat->node_size); + printk("\t%d pages of RAM\n", pgdat->node_spanned_pages); printk("\t%d reserved pages\n", reserved); printk("\t%d pages shared\n", shared); printk("\t%d pages swap cached\n", cached); diff -Nru a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c --- a/arch/mips/kernel/sysirix.c Thu Jul 3 01:12:43 2003 +++ b/arch/mips/kernel/sysirix.c Thu Jul 3 01:12:43 2003 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -527,8 +528,6 @@ return get_seconds(); } -int vm_enough_memory(long pages); - /* * IRIX is completely broken... it returns 0 on success, otherwise * ENOMEM. @@ -585,7 +584,7 @@ /* * Check if we have enough memory.. */ - if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) { + if (security_vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) { ret = -ENOMEM; goto out; } diff -Nru a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c --- a/arch/ppc64/mm/init.c Thu Jul 3 01:12:43 2003 +++ b/arch/ppc64/mm/init.c Thu Jul 3 01:12:43 2003 @@ -109,7 +109,7 @@ show_free_areas(); printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_size; i++) { + for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat->node_mem_map + i; total++; if (PageReserved(page)) @@ -564,7 +564,7 @@ int nid; for (nid = 0; nid < numnodes; nid++) { - if (node_data[nid].node_size != 0) { + if (node_data[nid].node_spanned_pages != 0) { printk("freeing bootmem node %x\n", nid); totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); diff -Nru a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c --- a/arch/ppc64/mm/numa.c Thu Jul 3 01:12:43 2003 +++ b/arch/ppc64/mm/numa.c Thu Jul 3 01:12:43 2003 @@ -160,21 +160,21 @@ * this simple case and complain if there is a gap in * memory */ - if (node_data[numa_domain].node_size) { + if (node_data[numa_domain].node_spanned_pages) { unsigned long shouldstart = node_data[numa_domain].node_start_pfn + - node_data[numa_domain].node_size; + node_data[numa_domain].node_spanned_pages; if (shouldstart != (start / PAGE_SIZE)) { printk(KERN_ERR "Hole in node, disabling " "region start %lx length %lx\n", start, size); continue; } - node_data[numa_domain].node_size += size / PAGE_SIZE; + node_data[numa_domain].node_spanned_pages += size / PAGE_SIZE; } else { node_data[numa_domain].node_start_pfn = start / PAGE_SIZE; - node_data[numa_domain].node_size = size / PAGE_SIZE; + node_data[numa_domain].node_spanned_pages = size / PAGE_SIZE; } for (i = start ; i < (start+size); i += MEMORY_INCREMENT) @@ -202,7 +202,7 @@ map_cpu_to_node(i, 0); node_data[0].node_start_pfn = 0; - node_data[0].node_size = lmb_end_of_DRAM() / PAGE_SIZE; + node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE; for (i = 0 ; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; @@ -224,12 +224,12 @@ unsigned long bootmem_paddr; unsigned long bootmap_pages; - if (node_data[nid].node_size == 0) + if (node_data[nid].node_spanned_pages == 0) continue; start_paddr = node_data[nid].node_start_pfn * PAGE_SIZE; end_paddr = start_paddr + - (node_data[nid].node_size * PAGE_SIZE); + 
(node_data[nid].node_spanned_pages * PAGE_SIZE); dbg("node %d\n", nid); dbg("start_paddr = %lx\n", start_paddr); @@ -311,7 +311,7 @@ unsigned long start_pfn; unsigned long end_pfn; - if (node_data[nid].node_size == 0) + if (node_data[nid].node_spanned_pages == 0) continue; start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT; diff -Nru a/arch/s390/kernel/compat_exec.c b/arch/s390/kernel/compat_exec.c --- a/arch/s390/kernel/compat_exec.c Thu Jul 3 01:12:43 2003 +++ b/arch/s390/kernel/compat_exec.c Thu Jul 3 01:12:43 2003 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -55,7 +56,7 @@ if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff -Nru a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c --- a/arch/x86_64/ia32/ia32_binfmt.c Thu Jul 3 01:12:43 2003 +++ b/arch/x86_64/ia32/ia32_binfmt.c Thu Jul 3 01:12:43 2003 @@ -14,6 +14,8 @@ #include #include #include +#include + #include #include #include @@ -339,7 +341,7 @@ if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff -Nru a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c --- a/arch/x86_64/mm/init.c Thu Jul 3 01:12:43 2003 +++ b/arch/x86_64/mm/init.c Thu Jul 3 01:12:43 2003 @@ -64,7 +64,7 @@ printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_size; ++i) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat->node_mem_map + i; total++; if (PageReserved(page)) diff -Nru a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c --- a/arch/x86_64/mm/numa.c Thu Jul 3 01:12:43 2003 +++ b/arch/x86_64/mm/numa.c Thu Jul 3 01:12:43 2003 @@ -86,7 +86,7 @@ memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_size = end_pfn - start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; /* Find a place for the bootmem map */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); diff -Nru a/drivers/block/cciss.c b/drivers/block/cciss.c --- a/drivers/block/cciss.c Thu Jul 3 01:12:43 2003 +++ b/drivers/block/cciss.c Thu Jul 3 01:12:43 2003 @@ -1887,7 +1887,7 @@ BUG(); if (( c = cmd_alloc(h, 1)) == NULL) - goto startio; + goto full; blkdev_dequeue_request(creq); @@ -1960,8 +1960,9 @@ h->maxQsinceinit = h->Qdepth; goto queue; -startio: +full: blk_stop_queue(q); +startio: start_io(h); } diff -Nru a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c --- a/drivers/block/ll_rw_blk.c Thu Jul 3 01:12:43 2003 +++ b/drivers/block/ll_rw_blk.c Thu Jul 3 01:12:43 2003 @@ -1072,8 +1072,8 @@ **/ void blk_start_queue(request_queue_t *q) { - if (test_and_clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) - schedule_work(&q->unplug_work); + clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); + schedule_work(&q->unplug_work); } /** diff -Nru a/drivers/net/Kconfig b/drivers/net/Kconfig --- a/drivers/net/Kconfig Thu Jul 3 01:12:43 2003 +++ b/drivers/net/Kconfig Thu Jul 3 01:12:43 2003 @@ -1397,7 +1397,7 @@ config TC35815 tristate "TOSHIBA TC35815 Ethernet support" - depends on 
NET_PCI && PCI + depends on NET_PCI && PCI && TOSHIBA_JMR3927 config DGRS tristate "Digi Intl. RightSwitch SE-X support" diff -Nru a/drivers/net/e100/e100_main.c b/drivers/net/e100/e100_main.c --- a/drivers/net/e100/e100_main.c Thu Jul 3 01:12:43 2003 +++ b/drivers/net/e100/e100_main.c Thu Jul 3 01:12:43 2003 @@ -1085,9 +1085,9 @@ goto exit1; } - e100_prepare_xmit_buff(bdp, skb); - bdp->drv_stats.net_stats.tx_bytes += skb->len; + + e100_prepare_xmit_buff(bdp, skb); dev->trans_start = jiffies; diff -Nru a/fs/attr.c b/fs/attr.c --- a/fs/attr.c Thu Jul 3 01:12:43 2003 +++ b/fs/attr.c Thu Jul 3 01:12:43 2003 @@ -22,8 +22,6 @@ int retval = -EPERM; unsigned int ia_valid = attr->ia_valid; - lock_kernel(); - /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) goto fine; @@ -58,7 +56,6 @@ fine: retval = 0; error: - unlock_kernel(); return retval; } diff -Nru a/fs/block_dev.c b/fs/block_dev.c --- a/fs/block_dev.c Thu Jul 3 01:12:43 2003 +++ b/fs/block_dev.c Thu Jul 3 01:12:43 2003 @@ -155,11 +155,13 @@ */ static loff_t block_llseek(struct file *file, loff_t offset, int origin) { - /* ewww */ - loff_t size = file->f_dentry->d_inode->i_bdev->bd_inode->i_size; + struct inode *bd_inode; + loff_t size; loff_t retval; - lock_kernel(); + bd_inode = file->f_dentry->d_inode->i_bdev->bd_inode; + down(&bd_inode->i_sem); + size = bd_inode->i_size; switch (origin) { case 2: @@ -175,7 +177,7 @@ } retval = offset; } - unlock_kernel(); + up(&bd_inode->i_sem); return retval; } diff -Nru a/fs/coda/file.c b/fs/coda/file.c --- a/fs/coda/file.c Thu Jul 3 01:12:43 2003 +++ b/fs/coda/file.c Thu Jul 3 01:12:43 2003 @@ -153,19 +153,22 @@ struct inode *coda_inode; int err = 0, fcnt; + lock_kernel(); + coda_vfs_stat.flush++; /* last close semantics */ fcnt = file_count(coda_file); - if (fcnt > 1) return 0; + if (fcnt > 1) + goto out; /* No need to make an upcall when we have not made any modifications * to the file */ if ((coda_file->f_flags & O_ACCMODE) == O_RDONLY) - return 0; + goto out; if (use_coda_close) - return 0; + goto out; cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -180,6 +183,8 @@ err = 0; } +out: + unlock_kernel(); return err; } diff -Nru a/fs/exec.c b/fs/exec.c --- a/fs/exec.c Thu Jul 3 01:12:43 2003 +++ b/fs/exec.c Thu Jul 3 01:12:43 2003 @@ -392,7 +392,7 @@ if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } @@ -441,9 +441,9 @@ { int i; - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { + for (i = 0; i < MAX_ARG_PAGES; i++) { if (bprm->page[i]) - __free_page(bprm->page[i]); + __free_page(bprm->page[i]); bprm->page[i] = NULL; } } @@ -772,6 +772,8 @@ if (retval) goto out; + bprm->mm = NULL; /* We're using it now */ + /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -999,7 +1001,7 @@ } read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval != -ENOEXEC) + if (retval != -ENOEXEC || bprm->mm == NULL) break; if (!bprm->file) { read_unlock(&binfmt_lock); @@ -1007,7 +1009,7 @@ } } read_unlock(&binfmt_lock); - if (retval != -ENOEXEC) { + if (retval != -ENOEXEC || bprm->mm == NULL) { break; #ifdef CONFIG_KMOD }else{ @@ -1035,7 +1037,6 @@ struct linux_binprm bprm; struct file *file; int retval; - int i; sched_balance_exec(); @@ -1103,17 +1104,14 @@ out: /* Something went wrong, return the inode and free the argument pages*/ - for 
(i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm.page[i]; - if (page) - __free_page(page); - } + free_arg_pages(&bprm); if (bprm.security) security_bprm_free(&bprm); out_mm: - mmdrop(bprm.mm); + if (bprm.mm) + mmdrop(bprm.mm); out_file: if (bprm.file) { diff -Nru a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c --- a/fs/ext2/ialloc.c Thu Jul 3 01:12:43 2003 +++ b/fs/ext2/ialloc.c Thu Jul 3 01:12:43 2003 @@ -489,17 +489,18 @@ return group; } -struct inode * ext2_new_inode(struct inode * dir, int mode) +struct inode *ext2_new_inode(struct inode *dir, int mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; int group, i; - ino_t ino; + ino_t ino = 0; struct inode * inode; - struct ext2_group_desc * desc; - struct ext2_super_block * es; + struct ext2_group_desc *gdp; + struct ext2_super_block *es; struct ext2_inode_info *ei; + struct ext2_sb_info *sbi; int err; sb = dir->i_sb; @@ -508,36 +509,62 @@ return ERR_PTR(-ENOMEM); ei = EXT2_I(inode); - es = EXT2_SB(sb)->s_es; + sbi = EXT2_SB(sb); + es = sbi->s_es; repeat: if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) + if (test_opt(sb, OLDALLOC)) group = find_group_dir(sb, dir); else group = find_group_orlov(sb, dir); } else group = find_group_other(sb, dir); - err = -ENOSPC; - if (group == -1) + if (group == -1) { + err = -ENOSPC; goto fail; + } - err = -EIO; - bitmap_bh = read_inode_bitmap(sb, group); - if (!bitmap_bh) - goto fail2; - - i = ext2_find_first_zero_bit((unsigned long *)bitmap_bh->b_data, - EXT2_INODES_PER_GROUP(sb)); - if (i >= EXT2_INODES_PER_GROUP(sb)) - goto bad_count; - if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), - i, (void *) bitmap_bh->b_data)) { + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext2_get_group_desc(sb, group, &bh2); brelse(bitmap_bh); - ext2_release_inode(sb, group, S_ISDIR(mode)); - goto repeat; + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) { + err = -EIO; + goto fail2; + } + + i = ext2_find_first_zero_bit((unsigned long *)bitmap_bh->b_data, + EXT2_INODES_PER_GROUP(sb)); + if (i >= EXT2_INODES_PER_GROUP(sb)) { + /* + * Rare race: find_group_xx() decided that there were + * free inodes in this group, but by the time we tried + * to allocate one, they're all gone. This can also + * occur because the counters which find_group_orlov() + * uses are approximate. So just go and search the + * next block group. + */ + if (++group == sbi->s_groups_count) + group = 0; + continue; + } + if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), + i, bitmap_bh->b_data)) { + brelse(bitmap_bh); + bitmap_bh = NULL; + ext2_release_inode(sb, group, S_ISDIR(mode)); + goto repeat; + } + goto got; } + /* + * Scanned all blockgroups. + */ + err = -ENOSPC; + goto fail2; +got: mark_buffer_dirty(bitmap_bh); if (sb->s_flags & MS_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); @@ -605,8 +632,9 @@ inode->i_generation = EXT2_SB(sb)->s_next_generation++; insert_inode_hash(inode); - if(DQUOT_ALLOC_INODE(inode)) { + if (DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); + err = -ENOSPC; goto fail3; } err = ext2_init_acl(inode, dir); @@ -631,21 +659,6 @@ make_bad_inode(inode); iput(inode); return ERR_PTR(err); - -bad_count: - brelse(bitmap_bh); - ext2_error (sb, "ext2_new_inode", - "Free inodes count corrupted in group %d", - group); - /* Is it really ENOSPC? 
*/ - err = -ENOSPC; - if (sb->s_flags & MS_RDONLY) - goto fail; - - desc = ext2_get_group_desc (sb, group, &bh2); - desc->bg_free_inodes_count = 0; - mark_buffer_dirty(bh2); - goto repeat; } unsigned long ext2_count_free_inodes (struct super_block * sb) diff -Nru a/fs/jbd/commit.c b/fs/jbd/commit.c --- a/fs/jbd/commit.c Thu Jul 3 01:12:43 2003 +++ b/fs/jbd/commit.c Thu Jul 3 01:12:43 2003 @@ -169,10 +169,23 @@ * that multiple journal_get_write_access() calls to the same * buffer are perfectly permissable. */ - while (commit_transaction->t_reserved_list) { - jh = commit_transaction->t_reserved_list; - JBUFFER_TRACE(jh, "reserved, unused: refile"); - journal_refile_buffer(journal, jh); + { + int nr = 0; + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + journal_refile_buffer(journal, jh); + nr++; + } + if (nr) { + static int noisy; + + if (noisy < 10) { + noisy++; + printk("%s: freed %d reserved buffers\n", + __FUNCTION__, nr); + } + } } /* diff -Nru a/fs/jbd/transaction.c b/fs/jbd/transaction.c --- a/fs/jbd/transaction.c Thu Jul 3 01:12:43 2003 +++ b/fs/jbd/transaction.c Thu Jul 3 01:12:43 2003 @@ -1168,37 +1168,24 @@ * journal_release_buffer: undo a get_write_access without any buffer * updates, if the update decided in the end that it didn't need access. * - * journal_get_write_access() can block, so it is quite possible for a - * journaling component to decide after the write access is returned - * that global state has changed and the update is no longer required. - * * The caller passes in the number of credits which should be put back for * this buffer (zero or one). + * + * We leave the buffer attached to t_reserved_list because even though this + * handle doesn't want it, some other concurrent handle may want to journal + * this buffer. If that handle is currently in between get_write_access() and + * journal_dirty_metadata() then it expects the buffer to be reserved. If + * we were to rip it off t_reserved_list here, the other handle will explode + * when journal_dirty_metadata is presented with a non-reserved buffer. + * + * If nobody really wants to journal this buffer then it will be thrown + * away at the start of commit. */ void journal_release_buffer(handle_t *handle, struct buffer_head *bh, int credits) { - transaction_t *transaction = handle->h_transaction; - journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); - - JBUFFER_TRACE(jh, "entry"); - - /* If the buffer is reserved but not modified by this - * transaction, then it is safe to release it. In all other - * cases, just leave the buffer as it is.
*/ - - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction && - !buffer_jbddirty(jh2bh(jh))) { - JBUFFER_TRACE(jh, "unused: refiling it"); - __journal_refile_buffer(jh); - } - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); + BUFFER_TRACE(bh, "entry"); handle->h_buffer_credits += credits; - JBUFFER_TRACE(jh, "exit"); } /** diff -Nru a/fs/nfs/file.c b/fs/nfs/file.c --- a/fs/nfs/file.c Thu Jul 3 01:12:43 2003 +++ b/fs/nfs/file.c Thu Jul 3 01:12:43 2003 @@ -104,11 +104,13 @@ dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + lock_kernel(); status = nfs_wb_file(inode, file); if (!status) { status = file->f_error; file->f_error = 0; } + unlock_kernel(); return status; } diff -Nru a/fs/open.c b/fs/open.c --- a/fs/open.c Thu Jul 3 01:12:43 2003 +++ b/fs/open.c Thu Jul 3 01:12:43 2003 @@ -952,11 +952,8 @@ return 0; } retval = 0; - if (filp->f_op && filp->f_op->flush) { - lock_kernel(); + if (filp->f_op && filp->f_op->flush) retval = filp->f_op->flush(filp); - unlock_kernel(); - } dnotify_flush(filp, id); locks_remove_posix(filp, id); fput(filp); diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c Thu Jul 3 01:12:43 2003 +++ b/fs/proc/proc_misc.c Thu Jul 3 01:12:43 2003 @@ -497,11 +497,10 @@ static int locks_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - lock_kernel(); - len = get_locks_status(page, start, off, count); - unlock_kernel(); - if (len < count) *eof = 1; + int len = get_locks_status(page, start, off, count); + + if (len < count) + *eof = 1; return len; } diff -Nru a/fs/proc/root.c b/fs/proc/root.c --- a/fs/proc/root.c Thu Jul 3 01:12:43 2003 +++ b/fs/proc/root.c Thu Jul 3 01:12:43 2003 @@ -81,11 +81,13 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry) { - if (dir->i_ino == PROC_ROOT_INO) { /* check for safety... */ - lock_kernel(); + /* + * nr_threads is actually protected by the tasklist_lock; + * however, it's conventional to do reads, especially for + * reporting, without any locking whatsoever. + */ + if (dir->i_ino == PROC_ROOT_INO) /* check for safety... 
*/ dir->i_nlink = proc_root.nlink + nr_threads; - unlock_kernel(); - } if (!proc_lookup(dir, dentry)) { return NULL; diff -Nru a/fs/ramfs/inode.c b/fs/ramfs/inode.c --- a/fs/ramfs/inode.c Thu Jul 3 01:12:43 2003 +++ b/fs/ramfs/inode.c Thu Jul 3 01:12:43 2003 @@ -146,6 +146,7 @@ .mmap = generic_file_mmap, .fsync = simple_sync_file, .sendfile = generic_file_sendfile, + .llseek = generic_file_llseek, }; static struct inode_operations ramfs_file_inode_operations = { diff -Nru a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h --- a/include/asm-alpha/mmzone.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-alpha/mmzone.h Thu Jul 3 01:12:43 2003 @@ -31,7 +31,6 @@ #define pa_to_nid(pa) alpha_pa_to_nid(pa) #define NODE_DATA(nid) (&node_data[(nid)]) -#define node_size(nid) (NODE_DATA(nid)->node_size) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) @@ -124,7 +123,7 @@ #define pfn_to_nid(pfn) pa_to_nid(((u64)pfn << PAGE_SHIFT)) #define pfn_valid(pfn) \ (((pfn) - node_start_pfn(pfn_to_nid(pfn))) < \ - node_size(pfn_to_nid(pfn))) \ + node_spanned_pages(pfn_to_nid(pfn))) \ #define virt_addr_valid(kaddr) pfn_valid((__pa(kaddr) >> PAGE_SHIFT)) diff -Nru a/include/asm-i386/cacheflush.h b/include/asm-i386/cacheflush.h --- a/include/asm-i386/cacheflush.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-i386/cacheflush.h Thu Jul 3 01:12:43 2003 @@ -17,4 +17,9 @@ void global_flush_tlb(void); int change_page_attr(struct page *page, int numpages, pgprot_t prot); +#ifdef CONFIG_DEBUG_PAGEALLOC +/* internal debugging function */ +void kernel_map_pages(struct page *page, int numpages, int enable); +#endif + #endif /* _I386_CACHEFLUSH_H */ diff -Nru a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h --- a/include/asm-i386/mmzone.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-i386/mmzone.h Thu Jul 3 01:12:43 2003 @@ -32,8 +32,7 @@ #define alloc_bootmem_low_pages_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define node_size(nid) (node_data[nid]->node_size) -#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) +#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) /* * Following are macros that each numa implmentation must define. 
@@ -54,7 +53,7 @@ #define node_end_pfn(nid) \ ({ \ pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_size; \ + __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) #define local_mapnr(kvaddr) \ diff -Nru a/include/asm-i386/timer.h b/include/asm-i386/timer.h --- a/include/asm-i386/timer.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-i386/timer.h Thu Jul 3 01:12:43 2003 @@ -21,8 +21,21 @@ #define TICK_SIZE (tick_nsec / 1000) extern struct timer_opts* select_timer(void); +extern void clock_fallback(void); /* Modifiers for buggy PIT handling */ extern int pit_latch_buggy; + +extern struct timer_opts *cur_timer; +extern int timer_ack; + +/* list of externed timers */ +extern struct timer_opts timer_none; +extern struct timer_opts timer_pit; +extern struct timer_opts timer_tsc; +#ifdef CONFIG_X86_CYCLONE_TIMER +extern struct timer_opts timer_cyclone; +#endif + #endif diff -Nru a/include/asm-mips64/mmzone.h b/include/asm-mips64/mmzone.h --- a/include/asm-mips64/mmzone.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-mips64/mmzone.h Thu Jul 3 01:12:43 2003 @@ -24,7 +24,7 @@ #define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa)) #define PLAT_NODE_DATA(n) (plat_node_data[n]) -#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size) +#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_spanned_pages) #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) diff -Nru a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h --- a/include/asm-ppc64/mmzone.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-ppc64/mmzone.h Thu Jul 3 01:12:43 2003 @@ -54,7 +54,6 @@ */ #define NODE_DATA(nid) (&node_data[nid]) -#define node_size(nid) (NODE_DATA(nid)->node_size) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* diff -Nru a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h --- a/include/asm-x86_64/mmzone.h Thu Jul 3 01:12:43 2003 +++ b/include/asm-x86_64/mmzone.h Thu Jul 3 01:12:43 2003 @@ -40,8 +40,7 @@ #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ - NODE_DATA(nid)->node_size) -#define node_size(nid) (NODE_DATA(nid)->node_size) + NODE_DATA(nid)->node_spanned_pages) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) diff -Nru a/include/linux/mm.h b/include/linux/mm.h --- a/include/linux/mm.h Thu Jul 3 01:12:43 2003 +++ b/include/linux/mm.h Thu Jul 3 01:12:43 2003 @@ -339,9 +339,14 @@ page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) { - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); + return __va(page_to_pfn(page) << PAGE_SHIFT); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) @@ -395,11 +400,6 @@ #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, @@ -609,5 +609,13 @@ int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned 
long from, unsigned long to, unsigned long size, pgprot_t prot); + +#ifndef CONFIG_DEBUG_PAGEALLOC +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -Nru a/include/linux/mman.h b/include/linux/mman.h --- a/include/linux/mman.h Thu Jul 3 01:12:43 2003 +++ b/include/linux/mman.h Thu Jul 3 01:12:43 2003 @@ -9,7 +9,8 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 -extern int vm_enough_memory(long pages); +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; extern atomic_t vm_committed_space; #ifdef CONFIG_SMP diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Thu Jul 3 01:12:43 2003 +++ b/include/linux/mmzone.h Thu Jul 3 01:12:43 2003 @@ -184,11 +184,16 @@ unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; unsigned long node_start_pfn; - unsigned long node_size; + unsigned long node_present_pages; /* total number of physical pages */ + unsigned long node_spanned_pages; /* total size of physical page + range, including holes */ int node_id; struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; } pg_data_t; + +#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) extern int numnodes; extern struct pglist_data *pgdat_list; diff -Nru a/include/linux/security.h b/include/linux/security.h --- a/include/linux/security.h Thu Jul 3 01:12:43 2003 +++ b/include/linux/security.h Thu Jul 3 01:12:43 2003 @@ -49,6 +49,7 @@ extern int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); extern void cap_task_reparent_to_init (struct task_struct *p); extern int cap_syslog (int type); +extern int cap_vm_enough_memory (long pages); static inline int cap_netlink_send (struct sk_buff *skb) { @@ -958,6 +959,10 @@ * See the syslog(2) manual page for an explanation of the @type values. * @type contains the type of action. * Return 0 if permission is granted. + * @vm_enough_memory: + * Check permissions for allocating a new virtual mapping. + * @pages contains the number of pages. + * Return 0 if permission is granted. * * @register_security: * allow module stacking. 
@@ -989,6 +994,7 @@ int (*quotactl) (int cmds, int type, int id, struct super_block * sb); int (*quota_on) (struct file * f); int (*syslog) (int type); + int (*vm_enough_memory) (long pages); int (*bprm_alloc_security) (struct linux_binprm * bprm); void (*bprm_free_security) (struct linux_binprm * bprm); @@ -1238,6 +1244,11 @@ return security_ops->syslog(type); } +static inline int security_vm_enough_memory(long pages) +{ + return security_ops->vm_enough_memory(pages); +} + static inline int security_bprm_alloc (struct linux_binprm *bprm) { return security_ops->bprm_alloc_security (bprm); @@ -1896,6 +1907,11 @@ static inline int security_syslog(int type) { return cap_syslog(type); +} + +static inline int security_vm_enough_memory(long pages) +{ + return cap_vm_enough_memory(pages); } static inline int security_bprm_alloc (struct linux_binprm *bprm) diff -Nru a/include/linux/slab.h b/include/linux/slab.h --- a/include/linux/slab.h Thu Jul 3 01:12:43 2003 +++ b/include/linux/slab.h Thu Jul 3 01:12:43 2003 @@ -114,6 +114,10 @@ extern kmem_cache_t *sighand_cachep; extern kmem_cache_t *bio_cachep; +void ptrinfo(unsigned long addr); + +extern atomic_t slab_reclaim_pages; + #endif /* __KERNEL__ */ #endif /* _LINUX_SLAB_H */ diff -Nru a/init/Kconfig b/init/Kconfig --- a/init/Kconfig Thu Jul 3 01:12:43 2003 +++ b/init/Kconfig Thu Jul 3 01:12:43 2003 @@ -93,7 +93,8 @@ limited in memory. config LOG_BUF_SHIFT - int "Kernel log buffer size" if DEBUG_KERNEL + int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL + range 12 20 default 17 if ARCH_S390 default 16 if X86_NUMAQ || IA64 default 15 if SMP diff -Nru a/kernel/exit.c b/kernel/exit.c --- a/kernel/exit.c Thu Jul 3 01:12:43 2003 +++ b/kernel/exit.c Thu Jul 3 01:12:43 2003 @@ -651,6 +651,8 @@ if (tsk->exit_signal != -1) { int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; do_notify_parent(tsk, signal); + } else if (tsk->ptrace) { + do_notify_parent(tsk, SIGCHLD); } tsk->state = TASK_ZOMBIE; @@ -715,7 +717,7 @@ tsk->exit_code = code; exit_notify(tsk); - if (tsk->exit_signal == -1) + if (tsk->exit_signal == -1 && tsk->ptrace == 0) release_task(tsk); schedule(); @@ -859,7 +861,7 @@ BUG_ON(state != TASK_DEAD); return 0; } - if (unlikely(p->exit_signal == -1)) + if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) /* * This can only happen in a race with a ptraced thread * dying on another processor. @@ -889,8 +891,12 @@ /* Double-check with lock held. */ if (p->real_parent != p->parent) { __ptrace_unlink(p); - do_notify_parent(p, p->exit_signal); p->state = TASK_ZOMBIE; + /* If this is a detached thread, this is where it goes away. 
diff -Nru a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h	Thu Jul  3 01:12:43 2003
+++ b/include/linux/slab.h	Thu Jul  3 01:12:43 2003
@@ -114,6 +114,10 @@
 extern kmem_cache_t	*sighand_cachep;
 extern kmem_cache_t	*bio_cachep;
 
+void ptrinfo(unsigned long addr);
+
+extern atomic_t slab_reclaim_pages;
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_SLAB_H */
diff -Nru a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig	Thu Jul  3 01:12:43 2003
+++ b/init/Kconfig	Thu Jul  3 01:12:43 2003
@@ -93,7 +93,8 @@
 	  limited in memory.
 
 config LOG_BUF_SHIFT
-	int "Kernel log buffer size" if DEBUG_KERNEL
+	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
+	range 12 20
 	default 17 if ARCH_S390
 	default 16 if X86_NUMAQ || IA64
 	default 15 if SMP
diff -Nru a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c	Thu Jul  3 01:12:43 2003
+++ b/kernel/exit.c	Thu Jul  3 01:12:43 2003
@@ -651,6 +651,8 @@
 	if (tsk->exit_signal != -1) {
 		int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD;
 		do_notify_parent(tsk, signal);
+	} else if (tsk->ptrace) {
+		do_notify_parent(tsk, SIGCHLD);
 	}
 
 	tsk->state = TASK_ZOMBIE;
@@ -715,7 +717,7 @@
 	tsk->exit_code = code;
 	exit_notify(tsk);
 
-	if (tsk->exit_signal == -1)
+	if (tsk->exit_signal == -1 && tsk->ptrace == 0)
 		release_task(tsk);
 
 	schedule();
@@ -859,7 +861,7 @@
 			BUG_ON(state != TASK_DEAD);
 			return 0;
 		}
-		if (unlikely(p->exit_signal == -1))
+		if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
 			/*
 			 * This can only happen in a race with a ptraced thread
 			 * dying on another processor.
@@ -889,8 +891,12 @@
 		/* Double-check with lock held.  */
 		if (p->real_parent != p->parent) {
 			__ptrace_unlink(p);
-			do_notify_parent(p, p->exit_signal);
 			p->state = TASK_ZOMBIE;
+			/* If this is a detached thread, this is where it goes away.
+			 */
+			if (p->exit_signal == -1)
+				release_task (p);
+			else
+				do_notify_parent(p, p->exit_signal);
 			p = NULL;
 		}
 		write_unlock_irq(&tasklist_lock);
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c	Thu Jul  3 01:12:43 2003
+++ b/kernel/fork.c	Thu Jul  3 01:12:43 2003
@@ -286,7 +286,7 @@
 			continue;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
-			if (!vm_enough_memory(len))
+			if (security_vm_enough_memory(len))
 				goto fail_nomem;
 			charge += len;
 		}
diff -Nru a/kernel/ksyms.c b/kernel/ksyms.c
--- a/kernel/ksyms.c	Thu Jul  3 01:12:43 2003
+++ b/kernel/ksyms.c	Thu Jul  3 01:12:43 2003
@@ -462,6 +462,7 @@
 #endif
 EXPORT_SYMBOL(schedule_timeout);
 EXPORT_SYMBOL(yield);
+EXPORT_SYMBOL(io_schedule);
 EXPORT_SYMBOL(__cond_resched);
 EXPORT_SYMBOL(set_user_nice);
 EXPORT_SYMBOL(task_nice);
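Note that callers flip their tests when converting: vm_enough_memory() returned 1 for success, while security_vm_enough_memory() follows the usual hook convention of 0 for success and -ENOMEM for refusal. The per-caller pattern used throughout mm/ below, condensed into a sketch (not a literal kernel excerpt):

static int example_charge_mapping(unsigned long len, unsigned long *vm_flags)
{
	long charged = len >> PAGE_SHIFT;

	if (security_vm_enough_memory(charged))
		return -ENOMEM;		/* hook already backed out its charge */
	*vm_flags |= VM_ACCOUNT;	/* unmap path will vm_unacct_memory() */
	return 0;
}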
diff -Nru a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c	Thu Jul  3 01:12:43 2003
+++ b/mm/mmap.c	Thu Jul  3 01:12:43 2003
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <linux/security.h>
 #include
 #include
@@ -53,65 +54,9 @@
 int sysctl_overcommit_ratio = 50;	/* default is 50% */
 atomic_t vm_committed_space = ATOMIC_INIT(0);
 
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 1 means there is enough memory for the allocation to
- * succeed and 0 implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-acounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- */
-extern atomic_t slab_reclaim_pages;
-int vm_enough_memory(long pages)
-{
-	unsigned long free, allowed;
-
-	vm_acct_memory(pages);
-
-	/*
-	 * Sometimes we want to use more memory than we have
-	 */
-	if (sysctl_overcommit_memory == 1)
-		return 1;
-
-	if (sysctl_overcommit_memory == 0) {
-		free = get_page_cache_size();
-		free += nr_free_pages();
-		free += nr_swap_pages;
-
-		/*
-		 * Any slabs which are created with the
-		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
-		 * which are reclaimable, under pressure.  The dentry
-		 * cache and most inode caches should fall into this
-		 */
-		free += atomic_read(&slab_reclaim_pages);
-
-		/*
-		 * Leave the last 3% for root
-		 */
-		if (!capable(CAP_SYS_ADMIN))
-			free -= free / 32;
-
-		if (free > pages)
-			return 1;
-		vm_unacct_memory(pages);
-		return 0;
-	}
-
-	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
-	allowed += total_swap_pages;
-
-	if (atomic_read(&vm_committed_space) < allowed)
-		return 1;
-
-	vm_unacct_memory(pages);
-
-	return 0;
-}
+EXPORT_SYMBOL(sysctl_overcommit_memory);
+EXPORT_SYMBOL(sysctl_overcommit_ratio);
+EXPORT_SYMBOL(vm_committed_space);
 
 /*
  * Requires inode->i_mapping->i_shared_sem
@@ -646,7 +591,7 @@
 		 * Private writable mapping: check memory availability
 		 */
 		charged = len >> PAGE_SHIFT;
-		if (!vm_enough_memory(charged))
+		if (security_vm_enough_memory(charged))
 			return -ENOMEM;
 		vm_flags |= VM_ACCOUNT;
 	}
@@ -950,7 +895,7 @@
 	grow = (address - vma->vm_end) >> PAGE_SHIFT;
 
 	/* Overcommit.. */
-	if (!vm_enough_memory(grow)) {
+	if (security_vm_enough_memory(grow)) {
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		return -ENOMEM;
 	}
@@ -1004,7 +949,7 @@
 	grow = (vma->vm_start - address) >> PAGE_SHIFT;
 
 	/* Overcommit.. */
-	if (!vm_enough_memory(grow)) {
+	if (security_vm_enough_memory(grow)) {
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		return -ENOMEM;
 	}
@@ -1376,7 +1321,7 @@
 	if (mm->map_count > MAX_MAP_COUNT)
 		return -ENOMEM;
 
-	if (!vm_enough_memory(len >> PAGE_SHIFT))
+	if (security_vm_enough_memory(len >> PAGE_SHIFT))
 		return -ENOMEM;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
diff -Nru a/mm/mprotect.c b/mm/mprotect.c
--- a/mm/mprotect.c	Thu Jul  3 01:12:43 2003
+++ b/mm/mprotect.c	Thu Jul  3 01:12:43 2003
@@ -175,7 +175,7 @@
 	if (newflags & VM_WRITE) {
 		if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
 			charged = (end - start) >> PAGE_SHIFT;
-			if (!vm_enough_memory(charged))
+			if (security_vm_enough_memory(charged))
 				return -ENOMEM;
 			newflags |= VM_ACCOUNT;
 		}
diff -Nru a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c	Thu Jul  3 01:12:43 2003
+++ b/mm/mremap.c	Thu Jul  3 01:12:43 2003
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <linux/security.h>
 #include
 #include
@@ -244,9 +245,7 @@
 	}
 
 	if (!move_page_tables(vma, new_addr, addr, old_len)) {
-		unsigned long must_fault_in;
-		unsigned long fault_in_start;
-		unsigned long fault_in_end;
+		unsigned long vm_locked = vma->vm_flags & VM_LOCKED;
 
 		if (allocated_vma) {
 			*new_vma = *vma;
@@ -272,14 +271,8 @@
 		} else
 			vma = NULL;		/* nothing more to do */
 
-		must_fault_in = new_vma->vm_flags & VM_LOCKED;
-		fault_in_start = new_vma->vm_start;
-		fault_in_end = new_vma->vm_end;
-
 		do_munmap(current->mm, addr, old_len);
 
-		/* new_vma could have been invalidated by do_munmap */
-
 		/* Restore VM_ACCOUNT if one or two pieces of vma left */
 		if (vma) {
 			vma->vm_flags |= VM_ACCOUNT;
@@ -288,9 +281,11 @@
 		}
 
 		current->mm->total_vm += new_len >> PAGE_SHIFT;
-		if (must_fault_in) {
+		if (vm_locked) {
 			current->mm->locked_vm += new_len >> PAGE_SHIFT;
-			make_pages_present(fault_in_start, fault_in_end);
+			if (new_len > old_len)
+				make_pages_present(new_addr + old_len,
+						   new_addr + new_len);
 		}
 		return new_addr;
 	}
@@ -391,7 +386,7 @@
 
 	if (vma->vm_flags & VM_ACCOUNT) {
 		charged = (new_len - old_len) >> PAGE_SHIFT;
-		if (!vm_enough_memory(charged))
+		if (security_vm_enough_memory(charged))
 			goto out_nc;
 	}
diff -Nru a/mm/nommu.c b/mm/nommu.c
--- a/mm/nommu.c	Thu Jul  3 01:12:43 2003
+++ b/mm/nommu.c	Thu Jul  3 01:12:43 2003
@@ -62,11 +62,8 @@
 	inode->i_size = offset;
 
 out_truncate:
-	if (inode->i_op && inode->i_op->truncate) {
-		lock_kernel();
+	if (inode->i_op && inode->i_op->truncate)
 		inode->i_op->truncate(inode);
-		unlock_kernel();
-	}
 	return 0;
 out_sig:
 	send_sig(SIGXFSZ, current, 0);
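A worked instance of the move_vma() change above, with assumed numbers: mremap() grows a VM_LOCKED mapping from 2 pages to 5 pages at new_addr. move_page_tables() has already transferred the 2 resident pages to the head of the new range, so only the tail can contain never-faulted pages:

	/* old_len = 2 pages, new_len = 5 pages (assumed) */
	if (new_len > old_len)		/* fault in pages 2..4 only */
		make_pages_present(new_addr + old_len,
				   new_addr + new_len);

Running make_pages_present() over the removed fault_in_start/fault_in_end range, as the old code did, could cover addresses that do_munmap() had just taken away.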
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Thu Jul  3 01:12:43 2003
+++ b/mm/page_alloc.c	Thu Jul  3 01:12:43 2003
@@ -32,6 +32,8 @@
 #include
 #include
 
+#include <linux/module.h>
+
 DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
 DECLARE_BITMAP(memblk_online_map, MAX_NR_MEMBLKS);
 struct pglist_data *pgdat_list;
@@ -41,6 +43,9 @@
 int numnodes = 1;
 int sysctl_lower_zone_protection = 0;
 
+EXPORT_SYMBOL(totalram_pages);
+EXPORT_SYMBOL(nr_swap_pages);
+
 /*
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
@@ -265,6 +270,7 @@
 	mod_page_state(pgfree, 1 << order);
 	free_pages_check(__FUNCTION__, page);
 	list_add(&page->list, &list);
+	kernel_map_pages(page, 1 << order, 0);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
@@ -556,7 +563,7 @@
 		    (!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += z->pages_low * sysctl_lower_zone_protection;
 	}
@@ -579,7 +586,7 @@
 		    (!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += local_min * sysctl_lower_zone_protection;
 	}
@@ -594,7 +601,7 @@
 
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -622,7 +629,7 @@
 		    (!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-				return page;
+				goto got_pg;
 		}
 		min += z->pages_low * sysctl_lower_zone_protection;
 	}
@@ -653,6 +660,9 @@
 			current->comm, order, gfp_mask);
 	}
 	return NULL;
+got_pg:
+	kernel_map_pages(page, 1 << order, 1);
+	return page;
 }
 
 /*
@@ -726,6 +736,7 @@
 	return sum;
 }
+EXPORT_SYMBOL(nr_free_pages);
 
 unsigned int nr_used_zone_pages(void)
 {
@@ -818,6 +829,7 @@
 EXPORT_PER_CPU_SYMBOL(page_states);
 
 atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
 #ifdef CONFIG_SMP
 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 #endif
@@ -896,7 +908,7 @@
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 
-	val->totalram = pgdat->node_size;
+	val->totalram = pgdat->node_present_pages;
 	val->freeram = nr_free_pages_pgdat(pgdat);
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
 	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
@@ -1131,12 +1143,13 @@
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		totalpages += zones_size[i];
-	pgdat->node_size = totalpages;
+	pgdat->node_spanned_pages = totalpages;
 
 	realtotalpages = totalpages;
 	if (zholes_size)
 		for (i = 0; i < MAX_NR_ZONES; i++)
 			realtotalpages -= zholes_size[i];
+	pgdat->node_present_pages = realtotalpages;
 	printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
 }
 
@@ -1342,7 +1355,7 @@
 	pgdat->node_start_pfn = node_start_pfn;
 	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
 	if (!node_mem_map) {
-		size = (pgdat->node_size + 1) * sizeof(struct page);
+		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
 		node_mem_map = alloc_bootmem_node(pgdat, size);
 	}
 	pgdat->node_mem_map = node_mem_map;
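What the allocator hooks buy: once the free path unmaps a page and the got_pg label maps it back in on allocation, a use-after-free dereferences an unmapped linear address. A hypothetical demonstration, for illustration only (not part of the patch):

	char *p = (char *)__get_free_page(GFP_KERNEL);

	free_page((unsigned long)p);	/* now unmapped by kernel_map_pages() */
	*p = 0x42;			/* immediate oops with a backtrace,
					 * instead of silently corrupting
					 * whoever owns the page next */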
diff -Nru a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c	Thu Jul  3 01:12:43 2003
+++ b/mm/shmem.c	Thu Jul  3 01:12:43 2003
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include <linux/security.h>
 #include
 #include
@@ -507,7 +508,7 @@
 	 */
 	change = VM_ACCT(attr->ia_size) - VM_ACCT(inode->i_size);
 	if (change > 0) {
-		if (!vm_enough_memory(change))
+		if (security_vm_enough_memory(change))
 			return -ENOMEM;
 	} else if (attr->ia_size < inode->i_size) {
 		vm_unacct_memory(-change);
@@ -1139,7 +1140,7 @@
 	maxpos = inode->i_size;
 	if (maxpos < pos + count) {
 		maxpos = pos + count;
-		if (!vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) {
+		if (security_vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) {
 			err = -ENOMEM;
 			goto out;
 		}
@@ -1493,7 +1494,7 @@
 		memcpy(info, symname, len);
 		inode->i_op = &shmem_symlink_inline_operations;
 	} else {
-		if (!vm_enough_memory(VM_ACCT(1))) {
+		if (security_vm_enough_memory(VM_ACCT(1))) {
 			iput(inode);
 			return -ENOMEM;
 		}
@@ -1887,7 +1888,7 @@
 	if (size > SHMEM_MAX_BYTES)
 		return ERR_PTR(-EINVAL);
 
-	if ((flags & VM_ACCOUNT) && !vm_enough_memory(VM_ACCT(size)))
+	if ((flags & VM_ACCOUNT) && security_vm_enough_memory(VM_ACCT(size)))
 		return ERR_PTR(-ENOMEM);
 
 	error = -ENOMEM;
diff -Nru a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c	Thu Jul  3 01:12:43 2003
+++ b/mm/slab.c	Thu Jul  3 01:12:43 2003
@@ -89,7 +89,12 @@
 #include
 #include
 #include
+#include
+#include
+
 #include
+#include
+#include
 
 /*
  * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
@@ -351,6 +356,34 @@
 #define POISON_AFTER	0x6b	/* for use-after-free poisoning */
 #define POISON_END	0xa5	/* end-byte of poisoning */
 
+static inline int obj_dbghead(kmem_cache_t *cachep)
+{
+	if (cachep->flags & SLAB_RED_ZONE)
+		return BYTES_PER_WORD;
+	return 0;
+}
+
+static inline int obj_dbglen(kmem_cache_t *cachep)
+{
+	int len = 0;
+
+	if (cachep->flags & SLAB_RED_ZONE) {
+		len += 2*BYTES_PER_WORD;
+	}
+	if (cachep->flags & SLAB_STORE_USER) {
+		len += BYTES_PER_WORD;
+	}
+	return len;
+}
+#else
+static inline int obj_dbghead(kmem_cache_t *cachep)
+{
+	return 0;
+}
+static inline int obj_dbglen(kmem_cache_t *cachep)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -430,6 +463,7 @@
  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
  */
 atomic_t slab_reclaim_pages;
+EXPORT_SYMBOL(slab_reclaim_pages);
 
 /*
  * chicken and egg problem: delay the per-cpu array allocation
@@ -765,16 +799,45 @@
 }
 
 #if DEBUG
-static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller)
 {
-	int size = cachep->objsize;
-	if (cachep->flags & SLAB_RED_ZONE) {
-		addr += BYTES_PER_WORD;
-		size -= 2*BYTES_PER_WORD;
-	}
-	if (cachep->flags & SLAB_STORE_USER) {
-		size -= BYTES_PER_WORD;
+	int size = cachep->objsize-obj_dbglen(cachep);
+
+	addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
+
+	if (size < 5*sizeof(unsigned long))
+		return;
+
+	*addr++=0x12345678;
+	*addr++=caller;
+	*addr++=smp_processor_id();
+	size -= 3*sizeof(unsigned long);
+	{
+		unsigned long *sptr = &caller;
+		unsigned long svalue;
+
+		while (((long) sptr & (THREAD_SIZE-1)) != 0) {
+			svalue = *sptr++;
+			if (kernel_text_address(svalue)) {
+				*addr++=svalue;
+				size -= sizeof(unsigned long);
+				if (size <= sizeof(unsigned long))
+					break;
+			}
+		}
+	}
+	*addr++=0x87654321;
+}
+#endif
+
+static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
+{
+	int size = cachep->objsize-obj_dbglen(cachep);
+	addr = &((char*)addr)[obj_dbghead(cachep)];
+
 	memset(addr, val, size);
 	*(unsigned char *)(addr+size-1) = POISON_END;
 }
@@ -796,15 +859,11 @@
 
 static void check_poison_obj(kmem_cache_t *cachep, void *addr)
 {
-	int size = cachep->objsize;
 	void *end;
-	if (cachep->flags & SLAB_RED_ZONE) {
-		addr += BYTES_PER_WORD;
-		size -= 2*BYTES_PER_WORD;
-	}
-	if (cachep->flags & SLAB_STORE_USER) {
-		size -= BYTES_PER_WORD;
-	}
+	int size = cachep->objsize-obj_dbglen(cachep);
+
+	addr = &((char*)addr)[obj_dbghead(cachep)];
+
 	end = scan_poisoned_obj(addr, size);
 	if (end) {
 		int s;
@@ -858,8 +917,16 @@
 		void *objp = slabp->s_mem + cachep->objsize * i;
 		int objlen = cachep->objsize;
 
-		if (cachep->flags & SLAB_POISON)
+		if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+			if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
+				kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
+			else
+				check_poison_obj(cachep, objp);
+#else
 			check_poison_obj(cachep, objp);
+#endif
+		}
 		if (cachep->flags & SLAB_STORE_USER)
 			objlen -= BYTES_PER_WORD;
@@ -952,6 +1019,10 @@
 	}
 
 #if FORCED_DEBUG
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	if (size < PAGE_SIZE-3*BYTES_PER_WORD && size > 128)
+		size = PAGE_SIZE-3*BYTES_PER_WORD;
+#endif
 	/*
 	 * Enable redzoning and last user accounting, except
 	 * - for caches with forced alignment: redzoning would violate the
@@ -1404,6 +1475,8 @@
 				slab_error(cachep, "constructor overwrote the"
 							" start of an object");
 		}
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
 #else
 		if (cachep->ctor)
 			cachep->ctor(objp, cachep, ctor_flags);
@@ -1584,25 +1657,28 @@
 	 * caller can perform a verify of its state (debugging).
 	 * Called without the cache-lock held.
 	 */
-		if (cachep->flags & SLAB_RED_ZONE) {
-			cachep->ctor(objp+BYTES_PER_WORD,
+		cachep->ctor(objp+obj_dbghead(cachep),
 					cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
-		} else {
-			cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
-		}
 	}
 	if (cachep->flags & SLAB_POISON && cachep->dtor) {
 		/* we want to cache poison the object,
 		 * call the destruction callback
 		 */
-		if (cachep->flags & SLAB_RED_ZONE)
-			cachep->dtor(objp+BYTES_PER_WORD, cachep, 0);
-		else
-			cachep->dtor(objp, cachep, 0);
+		cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
 	}
-	if (cachep->flags & SLAB_POISON)
+	if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+			store_stackinfo(cachep, objp, POISON_AFTER);
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
+		} else {
+			poison_obj(cachep, objp, POISON_AFTER);
+		}
+#else
 		poison_obj(cachep, objp, POISON_AFTER);
 #endif
+	}
+#endif
 	return objp;
 }
@@ -1617,6 +1693,7 @@
 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
 		entries++;
 		BUG_ON(entries > cachep->num);
+		BUG_ON(i < 0 || i >= cachep->num);
 	}
 	BUG_ON(entries != cachep->num - slabp->inuse);
 #endif
@@ -1746,9 +1823,16 @@
 	if (!objp)
 		return objp;
-	if (cachep->flags & SLAB_POISON) {
+	if (cachep->flags & SLAB_POISON) {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
+		else
+			check_poison_obj(cachep, objp);
+#else
 		check_poison_obj(cachep, objp);
-		poison_obj(cachep, objp, POISON_BEFORE);
+#endif
+		poison_obj(cachep, objp, POISON_BEFORE);
 	}
 	if (cachep->flags & SLAB_STORE_USER) {
 		objlen -= BYTES_PER_WORD;
@@ -2085,16 +2169,7 @@
 
 unsigned int kmem_cache_size(kmem_cache_t *cachep)
 {
-	unsigned int objlen = cachep->objsize;
-
-#if DEBUG
-	if (cachep->flags & SLAB_RED_ZONE)
-		objlen -= 2*BYTES_PER_WORD;
-	if (cachep->flags & SLAB_STORE_USER)
-		objlen -= BYTES_PER_WORD;
-#endif
-
-	return objlen;
+	return cachep->objsize-obj_dbglen(cachep);
 }
 
 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
@@ -2626,3 +2701,70 @@
 	return size;
 }
+
+void ptrinfo(unsigned long addr)
+{
+	struct page *page;
+
+	printk("Dumping data about address %p.\n", (void*)addr);
+	if (!virt_addr_valid((void*)addr)) {
+		printk("virt addr invalid.\n");
+		return;
+	}
+	do {
+		pgd_t *pgd = pgd_offset_k(addr);
+		pmd_t *pmd;
+		if (pgd_none(*pgd)) {
+			printk("No pgd.\n");
+			break;
+		}
+		pmd = pmd_offset(pgd, addr);
+		if (pmd_none(*pmd)) {
+			printk("No pmd.\n");
+			break;
+		}
+#ifdef CONFIG_X86
+		if (pmd_large(*pmd)) {
+			printk("Large page.\n");
+			break;
+		}
+#endif
+		printk("normal page, pte_val 0x%llx\n",
+				(unsigned long long)pte_val(*pte_offset_kernel(pmd, addr)));
+	} while(0);
+
+	page = virt_to_page((void*)addr);
+	printk("struct page at %p, flags %lxh.\n", page, page->flags);
+	if (PageSlab(page)) {
+		kmem_cache_t *c;
+		struct slab *s;
+		unsigned long flags;
+		int objnr;
+		void *objp;
+
+		c = GET_PAGE_CACHE(page);
+		printk("belongs to cache %s.\n",c->name);
+
+		spin_lock_irqsave(&c->spinlock, flags);
+		s = GET_PAGE_SLAB(page);
+		printk("slabp %p with %d inuse objects (from %d).\n",
+				s, s->inuse, c->num);
+		check_slabp(c,s);
+
+		objnr = (addr-(unsigned long)s->s_mem)/c->objsize;
+		objp = s->s_mem+c->objsize*objnr;
+		printk("points into object no %d, starting at %p, len %d.\n",
+				objnr, objp, c->objsize);
+		if (objnr >= c->num) {
+			printk("Bad obj number.\n");
+		} else {
+			kernel_map_pages(virt_to_page(objp), c->objsize/PAGE_SIZE, 1);
+
+			printk("redzone: %lxh/%lxh/%lxh.\n",
+					((unsigned long*)objp)[0],
+					((unsigned long*)(objp+c->objsize))[-2],
+					((unsigned long*)(objp+c->objsize))[-1]);
+		}
+		spin_unlock_irqrestore(&c->spinlock, flags);
+
+	}
+}
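Taken together, the debug layout that obj_dbghead()/obj_dbglen() describe is (only the parts enabled in cachep->flags are present):

	[red zone][ user-visible object        ][red zone][last-user address]
	  1 word    kmem_cache_size() bytes      1 word     1 word

which is why ptrinfo() above can print the head word and the two tail words of any slab address as red zone data. A hypothetical use, e.g. from a temporary debugging patch while chasing a corruption report (suspect_ptr is an invented name):

	ptrinfo((unsigned long)suspect_ptr);

This dumps the pte, the owning cache, the slab, and the object's red zones in one shot.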
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c	Thu Jul  3 01:12:43 2003
+++ b/mm/swap.c	Thu Jul  3 01:12:43 2003
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <linux/module.h>
 #include
 #include	/* for try_to_release_page() */
 #include
@@ -370,6 +371,7 @@
 	}
 	preempt_enable();
 }
+EXPORT_SYMBOL(vm_acct_memory);
 #endif
diff -Nru a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c	Thu Jul  3 01:12:43 2003
+++ b/mm/swapfile.c	Thu Jul  3 01:12:43 2003
@@ -20,7 +20,9 @@
 #include
 #include
 #include
+#include <linux/module.h>
 #include
+#include <linux/security.h>
 #include
 #include
@@ -30,6 +32,8 @@
 int total_swap_pages;
 static int swap_overflow;
 
+EXPORT_SYMBOL(total_swap_pages);
+
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
@@ -1042,7 +1046,7 @@
 		swap_list_unlock();
 		goto out_dput;
 	}
-	if (vm_enough_memory(p->pages))
+	if (!security_vm_enough_memory(p->pages))
 		vm_unacct_memory(p->pages);
 	else {
 		err = -ENOMEM;
diff -Nru a/security/capability.c b/security/capability.c
--- a/security/capability.c	Thu Jul  3 01:12:43 2003
+++ b/security/capability.c	Thu Jul  3 01:12:43 2003
@@ -15,6 +15,9 @@
 #include
 #include
 #include
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
 #include
 #include
 #include
@@ -275,6 +278,65 @@
 	return 0;
 }
 
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-acounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ */
+int cap_vm_enough_memory(long pages)
+{
+	unsigned long free, allowed;
+
+	vm_acct_memory(pages);
+
+	/*
+	 * Sometimes we want to use more memory than we have
+	 */
+	if (sysctl_overcommit_memory == 1)
+		return 0;
+
+	if (sysctl_overcommit_memory == 0) {
+		free = get_page_cache_size();
+		free += nr_free_pages();
+		free += nr_swap_pages;
+
+		/*
+		 * Any slabs which are created with the
+		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+		 * which are reclaimable, under pressure.  The dentry
+		 * cache and most inode caches should fall into this
+		 */
+		free += atomic_read(&slab_reclaim_pages);
+
+		/*
+		 * Leave the last 3% for root
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			free -= free / 32;
+
+		if (free > pages)
+			return 0;
+		vm_unacct_memory(pages);
+		return -ENOMEM;
+	}
+
+	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed += total_swap_pages;
+
+	if (atomic_read(&vm_committed_space) < allowed)
+		return 0;
+
+	vm_unacct_memory(pages);
+
+	return -ENOMEM;
+}
+
 EXPORT_SYMBOL(cap_capable);
 EXPORT_SYMBOL(cap_ptrace);
 EXPORT_SYMBOL(cap_capget);
@@ -286,6 +348,7 @@
 EXPORT_SYMBOL(cap_task_post_setuid);
 EXPORT_SYMBOL(cap_task_reparent_to_init);
 EXPORT_SYMBOL(cap_syslog);
+EXPORT_SYMBOL(cap_vm_enough_memory);
 
 #ifdef CONFIG_SECURITY
 
@@ -307,6 +370,8 @@
 	.task_reparent_to_init =	cap_task_reparent_to_init,
 
 	.syslog =			cap_syslog,
+
+	.vm_enough_memory =		cap_vm_enough_memory,
 };
 
 #if defined(CONFIG_SECURITY_CAPABILITIES_MODULE)
diff -Nru a/security/dummy.c b/security/dummy.c
--- a/security/dummy.c	Thu Jul  3 01:12:43 2003
+++ b/security/dummy.c	Thu Jul  3 01:12:43 2003
@@ -17,6 +17,9 @@
 #include
 #include
 #include
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
 #include
 #include
 #include
@@ -97,6 +100,54 @@
 	return 0;
 }
 
+static int dummy_vm_enough_memory(long pages)
+{
+	unsigned long free, allowed;
+
+	vm_acct_memory(pages);
+
+	/*
+	 * Sometimes we want to use more memory than we have
+	 */
+	if (sysctl_overcommit_memory == 1)
+		return 0;
+
+	if (sysctl_overcommit_memory == 0) {
+		free = get_page_cache_size();
+		free += nr_free_pages();
+		free += nr_swap_pages;
+
+		/*
+		 * Any slabs which are created with the
+		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+		 * which are reclaimable, under pressure.  The dentry
+		 * cache and most inode caches should fall into this
+		 */
+		free += atomic_read(&slab_reclaim_pages);
+
+		/*
+		 * Leave the last 3% for root
+		 */
+		if (current->euid)
+			free -= free / 32;
+
+		if (free > pages)
+			return 0;
+		vm_unacct_memory(pages);
+		return -ENOMEM;
+	}
+
+	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
+	allowed += total_swap_pages;
+
+	if (atomic_read(&vm_committed_space) < allowed)
+		return 0;
+
+	vm_unacct_memory(pages);
+
+	return -ENOMEM;
+}
+
 static int dummy_bprm_alloc_security (struct linux_binprm *bprm)
 {
 	return 0;
}
@@ -793,6 +844,7 @@
 	set_to_dummy_if_null(ops, quota_on);
 	set_to_dummy_if_null(ops, sysctl);
 	set_to_dummy_if_null(ops, syslog);
+	set_to_dummy_if_null(ops, vm_enough_memory);
 	set_to_dummy_if_null(ops, bprm_alloc_security);
 	set_to_dummy_if_null(ops, bprm_free_security);
 	set_to_dummy_if_null(ops, bprm_compute_creds);
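For reference, a worked instance of the strict-mode arithmetic shared by cap_vm_enough_memory() and dummy_vm_enough_memory(), with assumed numbers: 1 GiB of RAM (262144 4K pages), 512 MiB of swap (131072 pages) and the default sysctl_overcommit_ratio of 50 give

	allowed = 262144 * 50 / 100 + 131072 = 262144 pages;	/* 1 GiB */

so new mappings start failing once vm_committed_space reaches 262144 pages. In the heuristic mode, the "free -= free / 32" line reserves roughly 3%: 100000 free pages leave 96875 usable. The only behavioural difference between the two copies is who is exempt from that reserve: capability.c exempts holders of CAP_SYS_ADMIN, dummy.c exempts tasks with euid 0.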