diff -urN linux.orig/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux.orig/arch/i386/kernel/irq.c	Thu May 10 16:04:39 2001
+++ linux/arch/i386/kernel/irq.c	Thu May 10 12:16:21 2001
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/mm/reservation.h>
 
 #include
 #include
@@ -576,7 +577,10 @@
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
 	unsigned int status;
+	struct page_reservation *saved_irq_rsv;
 
+	saved_irq_rsv = current->page_reservations;
+	current->page_reservations = &irq_rsv;
 	kstat.irqs[cpu][irq]++;
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
@@ -638,6 +642,7 @@
 	if (softirq_active(cpu) & softirq_mask(cpu))
 		do_softirq();
 
+	current->page_reservations = saved_irq_rsv;
 	return 1;
 }
 
diff -urN linux.orig/fs/buffer.c linux/fs/buffer.c
--- linux.orig/fs/buffer.c	Thu May 10 16:07:27 2001
+++ linux/fs/buffer.c	Thu May 10 12:15:35 2001
@@ -45,6 +45,7 @@
 #include
 #include
 #include
+#include <linux/mm/reservation.h>
 
 #include
 #include
@@ -2735,6 +2736,7 @@
  */
 int bdflush(void *sem)
 {
+	static struct page_reservation rsv;
 	struct task_struct *tsk = current;
 	int flushed;
 	/*
@@ -2748,6 +2750,12 @@
 	strcpy(tsk->comm, "bdflush");
 	bdflush_tsk = tsk;
 
+	init_page_reservation(&rsv, RSV_MULTISHOT, ZONE_NORMAL);
+	if (reserve_pages(&rsv, GFP_KERNEL, 32))
+		panic("bdflush unable to reserve emergency pages!\n");
+	tsk->page_reservations = &rsv;
+
+
 	/* avoid getting signals */
 	spin_lock_irq(&tsk->sigmask_lock);
 	flush_signals(tsk);
@@ -2778,6 +2786,8 @@
 		   the next schedule will block. */
 		__set_current_state(TASK_RUNNING);
 	}
+
+	destroy_page_reservation(&rsv);
 }
 
 /*
@@ -2788,6 +2798,7 @@
  */
 int kupdate(void *sem)
 {
+	static struct page_reservation rsv;
 	struct task_struct * tsk = current;
 	int interval;
 
@@ -2795,6 +2806,11 @@
 	tsk->pgrp = 1;
 	strcpy(tsk->comm, "kupdated");
 
+	init_page_reservation(&rsv, RSV_MULTISHOT, ZONE_NORMAL);
+	if (reserve_pages(&rsv, GFP_KERNEL, 32))
+		panic("kupdate unable to reserve emergency pages!\n");
+	tsk->page_reservations = &rsv;
+
 	/* sigstop and sigcont will stop and wakeup kupdate */
 	spin_lock_irq(&tsk->sigmask_lock);
 	sigfillset(&tsk->blocked);
@@ -2833,6 +2849,7 @@
 #endif
 		sync_old_buffers();
 	}
+	destroy_page_reservation(&rsv);
 }
 
 static int __init bdflush_init(void)
diff -urN linux.orig/include/linux/mm/reservation.h linux/include/linux/mm/reservation.h
--- linux.orig/include/linux/mm/reservation.h	Wed Dec 31 19:00:00 1969
+++ linux/include/linux/mm/reservation.h	Thu May 10 12:16:21 2001
@@ -0,0 +1,48 @@
+#ifndef __LINUX__MM__RESERVATION_H
+#define __LINUX__MM__RESERVATION_H
+/* include/linux/mm/reservation.h
+ *	written by Benjamin LaHaise
+ *
+ * Copyright 2001 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * based in part on ideas/code from Arjan Van de Ven and Stephen Tweedie.
+ */
+
+#define RSV_ONESHOT	0x00
+#define RSV_MULTISHOT	0x01	/* reservation will replenish itself */
+
+struct page_reservation {
+	struct list_head	list;
+	unsigned		avail, used;
+	int			flags;
+	zone_t			*zone;
+};
+
+extern struct page_reservation irq_rsv;
+
+extern void init_page_reservation(struct page_reservation *rsv, int flags, int zone);
+extern void destroy_page_reservation(struct page_reservation *rsv);
+
+/* Reservation is an all or nothing thing.  A successful reservation
+ * returns 0.  Anything else is a failure.
+ */
+extern int reserve_pages(struct page_reservation *rsv, int gfp_mask, unsigned count);
+
+/* Release a previously reserved amount of memory. */
+extern void put_reserved_pages(struct page_reservation *rsv, unsigned count);
+
+#endif
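A minimal usage sketch of the API above (hypothetical kernel thread, abbreviated error
handling -- bdflush, kupdate and kswapd below follow exactly this pattern):

	#include <linux/sched.h>
	#include <linux/mm/reservation.h>

	static struct page_reservation my_rsv;

	int my_kernel_thread(void *unused)
	{
		/* RSV_MULTISHOT: pages freed back to the zone replenish
		 * the reserve; an RSV_ONESHOT reserve stays consumed. */
		init_page_reservation(&my_rsv, RSV_MULTISHOT, ZONE_NORMAL);

		/* All or nothing: 0 means all 16 pages were set aside;
		 * on failure the partial reserve has already been returned. */
		if (reserve_pages(&my_rsv, GFP_KERNEL, 16))
			return -ENOMEM;
		current->page_reservations = &my_rsv;

		/* ... order-0 allocations made here may dip into the
		 * reserve once the normal watermarks fail ... */

		current->page_reservations = NULL;	/* do_exit() would also clean up */
		destroy_page_reservation(&my_rsv);
		return 0;
	}
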
diff -urN linux.orig/include/linux/mmzone.h linux/include/linux/mmzone.h
--- linux.orig/include/linux/mmzone.h	Thu May 10 16:07:27 2001
+++ linux/include/linux/mmzone.h	Thu May 10 15:45:34 2001
@@ -50,6 +50,10 @@
 	unsigned long		inactive_dirty_pages;
 	unsigned long		pages_min, pages_low, pages_high;
 
+	/* Page reservation */
+	unsigned long		reserved_pages;
+	struct list_head	depleted_rsv_list;
+
 	/*
 	 * free areas of different sizes
 	 */
diff -urN linux.orig/include/linux/sched.h linux/include/linux/sched.h
--- linux.orig/include/linux/sched.h	Thu May 10 16:07:27 2001
+++ linux/include/linux/sched.h	Thu May 10 15:45:35 2001
@@ -406,6 +406,8 @@
 	u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty */
 	spinlock_t alloc_lock;
+
+	struct page_reservation *page_reservations;
 };
 
 /*
@@ -486,7 +488,8 @@
     sig:		&init_signals,					\
     pending:		{ NULL, &tsk.pending.head, {{0}}},		\
     blocked:		{{0}},						\
-    alloc_lock:		SPIN_LOCK_UNLOCKED				\
+    alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+    page_reservations:	NULL,						\
 }
diff -urN linux.orig/init/main.c linux/init/main.c
--- linux.orig/init/main.c	Thu May 10 16:04:39 2001
+++ linux/init/main.c	Thu May 10 12:16:21 2001
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include <linux/mm/reservation.h>
 
 #include
 #include
@@ -655,6 +656,8 @@
 #endif
 	mempages = num_physpages;
 
+	if (reserve_pages(&irq_rsv, GFP_KERNEL, mempages >> 8))
+		panic("unable to reserve memory.\n");
 	fork_init(mempages);
 	proc_caches_init();
 	vfs_caches_init(mempages);
diff -urN linux.orig/kernel/exit.c linux/kernel/exit.c
--- linux.orig/kernel/exit.c	Thu May 10 16:07:27 2001
+++ linux/kernel/exit.c	Thu May 10 12:15:34 2001
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <linux/mm/reservation.h>
 #ifdef CONFIG_BSD_PROCESS_ACCT
 #include
 #endif
@@ -422,6 +423,11 @@
 NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
+
+	if (tsk->page_reservations) {
+		destroy_page_reservation(tsk->page_reservations);
+		tsk->page_reservations = NULL;
+	}
 
 	if (in_interrupt())
 		panic("Aiee, killing interrupt handler!");
diff -urN linux.orig/kernel/fork.c linux/kernel/fork.c
--- linux.orig/kernel/fork.c	Thu May 10 16:07:27 2001
+++ linux/kernel/fork.c	Thu May 10 12:15:34 2001
@@ -630,6 +630,7 @@
 	p->tty_old_pgrp = 0;
 	p->times.tms_utime = p->times.tms_stime = 0;
 	p->times.tms_cutime = p->times.tms_cstime = 0;
+	p->page_reservations = 0;
 #ifdef CONFIG_SMP
 	{
 		int i;
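The mm/page_alloc.c changes that follow hinge on one invariant: zone->reserved_pages
counts pages promised to reservations but not yet handed out, so every allocator-side
watermark test becomes z->free_pages - z->reserved_pages. When rmqueue() hands a page
to a reservation holder it moves one unit from avail to used, and a multishot
reservation parks itself on zone->depleted_rsv_list so that the next page freed to
that zone moves the unit straight back. A stand-alone model of that cycle
(illustrative user-space C; locking and the zone-wide counter omitted):

	#include <assert.h>

	struct rsv { unsigned avail, used; };

	/* rmqueue(): a reserved page goes out; one unit moves from
	 * avail to used and the rsv joins the zone's depleted list. */
	static void alloc_from_reserve(struct rsv *r)
	{
		assert(r->avail > 0);
		r->avail--;
		r->used++;
	}

	/* __free_pages_ok(): a page freed to the zone replenishes the
	 * first depleted reservation before the per-cpu lists are used. */
	static void free_page_to_zone(struct rsv *r)
	{
		if (r->used) {
			r->used--;
			r->avail++;
		}
	}

	int main(void)
	{
		struct rsv r = { 2, 0 };	/* after reserve_pages(&r, ..., 2) */

		alloc_from_reserve(&r);		/* avail 1, used 1 */
		alloc_from_reserve(&r);		/* avail 0, used 2 */
		free_page_to_zone(&r);		/* avail 1, used 1 */
		assert(r.avail == 1 && r.used == 1);
		return 0;
	}
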
diff -urN linux.orig/mm/page_alloc.c linux/mm/page_alloc.c
--- linux.orig/mm/page_alloc.c	Thu May 10 16:07:27 2001
+++ linux/mm/page_alloc.c	Thu May 10 14:59:04 2001
@@ -18,7 +18,9 @@
 #include
 #include
 #include
+#include <linux/mm/reservation.h>
 
+struct page_reservation irq_rsv;
 int nr_swap_pages;
 int nr_active_pages;
 int nr_inactive_dirty_pages;
@@ -99,7 +101,7 @@
 	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
 	page->age = PAGE_AGE_START;
-	
+
 	zone = page->zone;
 
 	mask = (~0UL) << order;
 
@@ -115,7 +117,8 @@
 	__save_flags(flags);
 	__cli();
-	if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages)) {
+	if (!order && (per_cpu->nr_pages < per_cpu->max_nr_pages) && list_empty(&zone->depleted_rsv_list)) {
+static int foo; if (foo++ < 5) printk("freeing per-cpu page\n");
 		list_add(&page->list, &per_cpu->head);
 		per_cpu->nr_pages++;
 		__restore_flags(flags);
@@ -124,6 +127,21 @@
 
 	spin_lock(&zone->lock);
 
+	/* Check if we need to replenish any of this zone's reservations. */
+	if (!list_empty(&zone->depleted_rsv_list)) {
+		struct page_reservation *rsv = list_entry(zone->depleted_rsv_list.next, struct page_reservation, list);
+static int foo; if (foo++ < 5) printk("updating reserve: %p %u %u\n", rsv, rsv->avail, rsv->used);
+		if (!rsv->used)
+			BUG();
+		rsv->avail++;
+		rsv->used--;
+		zone->reserved_pages++;
+
+		list_del_init(&rsv->list);
+		if (rsv->used)
+			list_add(&rsv->list, zone->depleted_rsv_list.prev);
+	}
+
 	zone->free_pages -= mask;
 
 	while (mask + (1 << (MAX_ORDER-1))) {
@@ -190,8 +208,8 @@
 }
 
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order));
-static struct page * rmqueue(zone_t *zone, unsigned long order)
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned long order, struct page_reservation *rsv));
+static struct page * rmqueue(zone_t *zone, unsigned long order, struct page_reservation *rsv)
 {
 	per_cpu_t *per_cpu = zone->cpu_pages + smp_processor_id();
 	free_area_t * area = zone->free_area + order;
@@ -232,6 +250,15 @@
 			zone->free_pages -= 1 << order;
 
 			page = expand(zone, page, index, order, curr_order, area);
+			if (rsv && rsv->avail) {
+static int foo; if (foo++ < 5) printk("alloc from reserv: %p %u %u\n", rsv, rsv->avail, rsv->used);
+				rsv->avail--;
+				if (!rsv->used++ && (rsv->flags & RSV_MULTISHOT)) {
+static int foo; if (foo++ < 5) printk("multishot reserv: %p\n", rsv);
+					list_add(&rsv->list, &zone->depleted_rsv_list);
+				}
+				zone->reserved_pages--;
+			}
 			spin_unlock_irqrestore(&zone->lock, flags);
 
 			set_page_count(page, 1);
@@ -249,8 +276,9 @@
 }
 
 #define PAGES_MIN	0
-#define PAGES_LOW	1
-#define PAGES_HIGH	2
+#define PAGES_RSV	1
+#define PAGES_LOW	2
+#define PAGES_HIGH	3
 
 /*
  * This function does the dirty work for __alloc_pages
@@ -261,10 +289,12 @@
 		unsigned long order, int limit, int direct_reclaim)
 {
 	zone_t **zone = zonelist->zones;
+	struct page_reservation *rsv = NULL;
 
 	for (;;) {
 		zone_t *z = *(zone++);
 		unsigned long water_mark;
+		unsigned long free_pages;
 
 		if (!z)
 			break;
@@ -275,11 +305,22 @@
 		 * We allocate if the number of free + inactive_clean
 		 * pages is above the watermark.
 		 */
+		free_pages = z->free_pages - z->reserved_pages;
+
 		switch (limit) {
 			default:
 			case PAGES_MIN:
 				water_mark = z->pages_min;
 				break;
+			case PAGES_RSV:
+				water_mark = z->pages_low;
+				rsv = current->page_reservations;
+				if ((rsv->zone == z) && rsv->avail) {
+static int foo; if (foo++ < 5) printk("hit page reservation: %p\n", rsv);
+					free_pages += rsv->avail;
+				} else
+					rsv = NULL;
+				break;
 			case PAGES_LOW:
 				water_mark = z->pages_low;
 				break;
@@ -287,14 +328,14 @@
 				water_mark = z->pages_high;
 		}
 
-		if (z->free_pages + z->inactive_clean_pages > water_mark) {
+		if (free_pages + z->inactive_clean_pages > water_mark) {
 			struct page *page = NULL;
 			/* If possible, reclaim a page directly. */
-			if (direct_reclaim && z->free_pages < z->pages_min + 8)
+			if (direct_reclaim && free_pages < z->pages_min + 8)
 				page = reclaim_page(z);
 			/* If that fails, fall back to rmqueue. */
 			if (!page)
-				page = rmqueue(z, order);
+				page = rmqueue(z, order, rsv);
 			if (page)
 				return page;
 		}
@@ -304,6 +345,8 @@
 	return NULL;
 }
 
+extern struct page *get_reserved_page (void);
+
 /*
  * This is the 'heart' of the zoned buddy allocator:
@@ -320,7 +363,7 @@
 	 * Allocations put pressure on the VM subsystem.
 	 */
 	memory_pressure++;
-	
+
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
 	 * will sooner or later tripped up by a schedule().)
@@ -351,11 +394,11 @@
 		if (!z->size)
 			BUG();
 
-		if (z->free_pages >= z->pages_low) {
-			page = rmqueue(z, order);
+		if (z->free_pages - z->reserved_pages >= z->pages_low) {
+			page = rmqueue(z, order, NULL);
 			if (page)
-				return page;
-		} else if (z->free_pages < z->pages_min &&
+				goto out_success;
+		} else if (z->free_pages - z->reserved_pages < z->pages_min &&
 			   waitqueue_active(&kreclaimd_wait)) {
 			wake_up_interruptible(&kreclaimd_wait);
 		}
@@ -371,7 +414,7 @@
 	 */
 	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
 	if (page)
-		return page;
+		goto out_success;
 
 	/*
 	 * Then try to allocate a page from a zone with more
@@ -383,7 +426,7 @@
 	 */
 	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
 	if (page)
-		return page;
+		goto out_success;
 
 	/*
 	 * OK, none of the zones on our zonelist has lots
@@ -418,8 +461,22 @@
 	 */
 	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
 	if (page)
-		return page;
+		goto out_success;
+	/* Memory reservation hook.  Note: memory reservations are
+	 * attempted after all other normal means of allocations have
+	 * failed.  Give it a try with the memory reservation and see
+	 * what happens.
+	 * TODO: with memory reservations in place, much of the code
+	 * below is completely bogus.  Clean this up! -ben
+	 */
+	if (!order && current->page_reservations) {
+static int foo; if (foo++ < 5) printk("trying reservation: %p\n", current->page_reservations);
+		page = __alloc_pages_limit(zonelist, order, PAGES_RSV, direct_reclaim);
+		if (page)
+			goto out_success;
+	}
+
 	/*
 	 * If we dont want to try too hard then we can give up
 	 * now
@@ -465,9 +522,9 @@
 				break;
 			__free_page(page);
 			/* Try if the allocation succeeds. */
-			page = rmqueue(z, order);
+			page = rmqueue(z, order, NULL);
 			if (page)
-				return page;
+				goto out_success;
 		}
 	}
 }
@@ -511,31 +568,25 @@
 		if (direct_reclaim) {
 			page = reclaim_page(z);
 			if (page)
-				return page;
+				goto out_success;
 		}
 
 		/* XXX: is pages_min/4 a good amount to reserve for this? */
-		if (z->free_pages < z->pages_min / 4 &&
+		if (z->free_pages - z->reserved_pages < z->pages_min / 4 &&
 		    !((current->flags & PF_MEMALLOC) &&
 		      (gfp_mask & __GFP_WAIT)))
 			continue;
-		page = rmqueue(z, order);
+		page = rmqueue(z, order, NULL);
 		if (page)
-			return page;
+			goto out_success;
 	}
 
-	// okay - we are in trouble, lets go to the DMA pool directly:
-
-	{
-		zone_t *z = pgdat_list->node_zones;
-
-		page = rmqueue(z, order);
-		if (page)
-			return page;
-	}
 	/* No luck.. */
 	printk(KERN_INFO "__alloc_pages: %lu-order allocation failed.\n", order);
 	return NULL;
+
+out_success:
+	return page;
 }
 
 /*
@@ -588,7 +639,7 @@
 	sum = 0;
 	while (pgdat) {
 		for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
-			sum += zone->free_pages;
+			sum += zone->free_pages - zone->reserved_pages;
 		pgdat = pgdat->node_next;
 	}
 	return sum;
@@ -605,7 +656,8 @@
 	sum = 0;
 	pgdat = pgdat_list;
 	while (pgdat) {
-		sum += (pgdat->node_zones+zone_type)->free_pages;
+		zone_t *z = pgdat->node_zones+zone_type;
+		sum += z->free_pages - z->reserved_pages;
 		pgdat = pgdat->node_next;
 	}
 	return sum;
@@ -694,6 +746,7 @@
 	while (pgdat) {
 		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+		pages -= pgdat->node_zones[ZONE_HIGHMEM].reserved_pages;
 		pgdat = pgdat->node_next;
 	}
 	return pages;
@@ -901,8 +954,10 @@
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 		zone->inactive_clean_pages = 0;
 		zone->inactive_dirty_pages = 0;
+		zone->reserved_pages = 0;
+		INIT_LIST_HEAD(&zone->depleted_rsv_list);
 		memlist_init(&zone->inactive_clean_list);
 		if (!size)
 			continue;
@@ -961,6 +1016,8 @@
 		}
 	}
 	build_zonelists(pgdat);
+
+	init_page_reservation(&irq_rsv, RSV_MULTISHOT, ZONE_NORMAL);
 }
 
 void __init free_area_init(unsigned long *zones_size)
@@ -977,6 +1034,90 @@
 	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
 	printk("\n");
 	return 1;
+}
+
+void init_page_reservation(struct page_reservation *rsv, int flags, int zone)
+{
+static int foo; if (foo++ < 5) printk("init_page_reservation(%p, %d, %d)\n", rsv, flags, zone);
+	INIT_LIST_HEAD(&rsv->list);
+	rsv->avail = 0;
+	rsv->used = 0;
+	rsv->flags = flags;
+
+	/* FIXME: This doesn't work properly on NUMA or multizoned setups.
+	 */
+	rsv->zone = &pgdat_list->node_zones[zone];
+}
+
+void destroy_page_reservation(struct page_reservation *rsv)
+{
+	unsigned long flags;
+	zone_t *zone = rsv->zone;
+static int foo; if (foo++ < 5) printk("destroy_page_reservation(%p)\n", rsv);
+
+	spin_lock_irqsave(&zone->lock, flags);
+	zone->reserved_pages -= rsv->avail;
+	list_del(&rsv->list);	/* This relies on list_del_init being used */
+	spin_unlock_irqrestore(&zone->lock, flags);
+	memset(rsv, 0x57, sizeof(*rsv));
+}
+
+int reserve_pages(struct page_reservation *rsv, int gfp_mask, unsigned count)
+{
+	unsigned long flags, free_pages;
+	zone_t *zone = rsv->zone;
+	unsigned orig = count;
+	int tries = 5;
+static int foo; if (foo++ < 5) printk("reserve_pages(%p, %d, %u)\n", rsv, gfp_mask, count);
+
+	while (--tries && count) {
+		spin_lock_irqsave(&zone->lock, flags);
+		free_pages = zone->free_pages - zone->reserved_pages;
+		if (free_pages > count)
+			free_pages = count;
+		count -= free_pages;
+		zone->reserved_pages += free_pages;
+		rsv->avail += free_pages;
+		spin_unlock_irqrestore(&zone->lock, flags);
+
+		if (count) {
+			try_to_free_pages(gfp_mask);
+			if ((gfp_mask & __GFP_WAIT) && !(current->flags & PF_ATOMICALLOC)) {
+				__set_current_state(TASK_RUNNING);
+				current->policy |= SCHED_YIELD;
+				schedule();
+			}
+		}
+	}
+
+	if (!count)
+		return 0;
+
+	put_reserved_pages(rsv, orig - count);
+	return -ENOMEM;
+}
+
+void put_reserved_pages(struct page_reservation *rsv, unsigned count)
+{
+	unsigned long flags;
+	zone_t *zone = rsv->zone;
+static int foo; if (foo++ < 5) printk("put_reserved_pages(%p, %u)\n", rsv, count);
+	spin_lock_irqsave(&zone->lock, flags);
+
+	if (rsv->used <= count) {
+		count -= rsv->used;
+		rsv->used = 0;
+	} else {
+		rsv->used -= count;
+		count = 0;
+	}
+
+	if (count > rsv->avail)
+		BUG();
+
+	rsv->avail -= count;
+	zone->reserved_pages -= count;
+	spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 __setup("memfrac=", setup_mem_frac);
diff -urN linux.orig/mm/vmscan.c linux/mm/vmscan.c
--- linux.orig/mm/vmscan.c	Thu May 10 16:07:27 2001
+++ linux/mm/vmscan.c	Thu May 10 12:15:35 2001
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/mm/reservation.h>
 
 #include
 
@@ -505,7 +506,7 @@
 		if (launder_loop && !maxlaunder)
 			break;
 		if (launder_loop && zone->inactive_clean_pages +
-				zone->free_pages > zone->pages_high)
+				zone->free_pages - zone->reserved_pages > zone->pages_high)
 			goto skip_page;
 
 		/*
@@ -541,8 +542,9 @@
 			ClearPageDirty(page);
 			page_cache_get(page);
 			spin_unlock(&pagemap_lru_lock);
-
-			writepage(page);
+
+			writepage(page);
+			/* XXX: all ->writepage()s should use nr_async_pages */
 
 			if (!PageSwapCache(page))
 				flushed_pages++;
@@ -835,10 +837,11 @@
 		for(i = 0; i < MAX_NR_ZONES; i++) {
 			zone_t *zone = pgdat->node_zones+ i;
 			if (zone->size && (zone->inactive_clean_pages +
-					zone->free_pages < zone->pages_min+1)) {
+					zone->free_pages - zone->reserved_pages < zone->pages_min+1)) {
 				/* + 1 to have overlap with alloc_pages() !! */
 				sum += zone->pages_min + 1;
 				sum -= zone->free_pages;
+				sum += zone->reserved_pages;
 				sum -= zone->inactive_clean_pages;
 			}
 		}
@@ -881,6 +884,7 @@
 			zone_shortage -= zone->inactive_dirty_pages;
 			zone_shortage -= zone->inactive_clean_pages;
 			zone_shortage -= zone->free_pages;
+			zone_shortage += zone->reserved_pages;
 			if (zone_shortage > 0)
 				shortage += zone_shortage;
 		}
@@ -1009,6 +1013,7 @@
 
 int kswapd(void *unused)
 {
+	static struct page_reservation kswapd_rsv;
 	struct task_struct *tsk = current;
 
 	tsk->session = 1;
@@ -1016,6 +1021,11 @@
 	strcpy(tsk->comm, "kswapd");
 	sigfillset(&tsk->blocked);
 	kswapd_task = tsk;
+
+	init_page_reservation(&kswapd_rsv, RSV_MULTISHOT, ZONE_NORMAL);
+	if (reserve_pages(&kswapd_rsv, GFP_KERNEL, 32))
+		panic("kswapd unable to reserve emergency pages!\n");
+	tsk->page_reservations = &kswapd_rsv;
 
 	/*
 	 * Tell the memory management that we're a "memory allocator",
@@ -1086,6 +1096,8 @@
 			oom_kill();
 		}
 	}
+
+	destroy_page_reservation(&kswapd_rsv);
 }
 
 void wakeup_kswapd(void)
@@ -1151,7 +1163,7 @@
 		if (!zone->size)
 			continue;
 
-		while (zone->free_pages < zone->pages_low) {
+		while (zone->free_pages - zone->reserved_pages < zone->pages_low) {
 			struct page * page;
 			page = reclaim_page(zone);
 			if (!page)
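Two closing notes. On sizing: the irq_rsv reservation made in init/main.c is
mempages >> 8, i.e. 1/256 of physical memory -- on a 128MB machine with 4KB pages
that is 32768 >> 8 = 128 pages (512KB) held back for interrupt-context allocations,
while bdflush, kupdated and kswapd each pin a further 32 pages (128KB). On
RSV_ONESHOT: it has no user in this patch; per the flags test in rmqueue(), a
oneshot reservation never queues itself on depleted_rsv_list, so once drained it
stays empty until re-reserved. A hypothetical oneshot user (names illustrative):

	#include <linux/sched.h>
	#include <linux/mm/reservation.h>

	static struct page_reservation oneshot_rsv;

	int critical_operation(void)
	{
		init_page_reservation(&oneshot_rsv, RSV_ONESHOT, ZONE_NORMAL);
		if (reserve_pages(&oneshot_rsv, GFP_KERNEL, 8))
			return -ENOMEM;
		current->page_reservations = &oneshot_rsv;

		/* ... allocations that must not fail ... */

		current->page_reservations = NULL;
		/* Returns the pages still in "avail" to the zone; pages
		 * already consumed were uncharged at allocation time. */
		destroy_page_reservation(&oneshot_rsv);
		return 0;
	}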