From: Robert Love

- Let real-time tasks dip further into the reserves than usual in
  __alloc_pages().  There are a lot of ways to special case this.  This patch
  just cuts z->pages_low in half, before doing the incremental min thing, for
  real-time tasks.  I do not do anything in the low memory slow path.  We can
  be a _lot_ more aggressive if we want.  Right now, we just give real-time
  tasks a little help.

- Never ever call balance_dirty_pages() on a real-time task.  Where and how
  exactly we handle this is up for debate.  We could, for example, special
  case real-time tasks inside balance_dirty_pages().  This would allow us to
  perform some of the work (say, waking up pdflush) but not other work (say,
  the active throttling).  As it stands now, we do the per-processor
  accounting in balance_dirty_pages_ratelimited() but we never call
  balance_dirty_pages().  Lots of approaches work.  What we want to do is
  never engage the real-time task in forced writeback.


 include/linux/sched.h |    4 +++-
 kernel/sched.c        |    1 -
 mm/page-writeback.c   |    6 +++++-
 mm/page_alloc.c       |   29 ++++++++++++++++++++---------
 4 files changed, 28 insertions(+), 12 deletions(-)

diff -puN include/linux/sched.h~rt-tasks-special-vm-treatment include/linux/sched.h
--- 25/include/linux/sched.h~rt-tasks-special-vm-treatment	2003-08-08 23:11:54.000000000 -0700
+++ 25-akpm/include/linux/sched.h	2003-08-08 23:11:54.000000000 -0700
@@ -282,7 +282,9 @@ struct signal_struct {
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
-
+
+#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
+
 /*
  * Some day this will be a full-fledged user tracking system..
  */
diff -puN kernel/sched.c~rt-tasks-special-vm-treatment kernel/sched.c
--- 25/kernel/sched.c~rt-tasks-special-vm-treatment	2003-08-08 23:11:54.000000000 -0700
+++ 25-akpm/kernel/sched.c	2003-08-08 23:11:54.000000000 -0700
@@ -208,7 +208,6 @@ static DEFINE_PER_CPU(struct runqueue, r
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
-#define rt_task(p)		((p)->prio < MAX_RT_PRIO)
 
 /*
  * Default context-switch locking:
diff -puN mm/page_alloc.c~rt-tasks-special-vm-treatment mm/page_alloc.c
--- 25/mm/page_alloc.c~rt-tasks-special-vm-treatment	2003-08-08 23:11:54.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-08-08 23:11:54.000000000 -0700
@@ -519,7 +519,8 @@ static struct page *buffered_rmqueue(str
  *
  * Herein lies the mysterious "incremental min".  That's the
  *
- *	min += z->pages_low;
+ *	local_low = z->pages_low;
+ *	min += local_low;
  *
  * thing.  The intent here is to provide additional protection to low zones for
  * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
@@ -537,10 +538,11 @@ __alloc_pages(unsigned int gfp_mask, uns
 	unsigned long min;
 	struct zone **zones, *classzone;
 	struct page *page;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
 	int i;
 	int cold;
 	int do_retry;
-	struct reclaim_state reclaim_state;
 
 	if (wait)
 		might_sleep();
@@ -558,8 +560,17 @@ __alloc_pages(unsigned int gfp_mask, uns
 	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
+		unsigned long local_low;
+
+		/*
+		 * This is the fabled 'incremental min'.  We let real-time tasks
+		 * dip their real-time paws a little deeper into reserves.
+		 */
+		local_low = z->pages_low;
+		if (rt_task(p))
+			local_low >>= 1;
+		min += local_low;
 
-		min += z->pages_low;
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
@@ -595,7 +606,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	/* here we're in the low on memory slow path */
 
 rebalance:
-	if ((current->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+	if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
@@ -611,14 +622,14 @@ rebalance:
 	if (!wait)
 		goto nopage;
 
-	current->flags |= PF_MEMALLOC;
+	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
-	current->reclaim_state = &reclaim_state;
+	p->reclaim_state = &reclaim_state;
 
 	try_to_free_pages(classzone, gfp_mask, order);
 
-	current->reclaim_state = NULL;
-	current->flags &= ~PF_MEMALLOC;
+	p->reclaim_state = NULL;
+	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
 	min = 1UL << order;
@@ -658,7 +669,7 @@ nopage:
 	if (!(gfp_mask & __GFP_NOWARN)) {
 		printk("%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
-			current->comm, order, gfp_mask);
+			p->comm, order, gfp_mask);
 	}
 	return NULL;
 got_pg:
diff -puN mm/page-writeback.c~rt-tasks-special-vm-treatment mm/page-writeback.c
--- 25/mm/page-writeback.c~rt-tasks-special-vm-treatment	2003-08-08 23:11:54.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-08-08 23:11:54.000000000 -0700
@@ -219,7 +219,11 @@ void balance_dirty_pages_ratelimited(str
 	if (dirty_exceeded)
 		ratelimit = 8;
 
-	if (get_cpu_var(ratelimits)++ >= ratelimit) {
+	/*
+	 * Check the rate limiting.  Also, we do not want to throttle real-time
+	 * tasks in balance_dirty_pages().  Period.
+	 */
+	if (get_cpu_var(ratelimits)++ >= ratelimit && !rt_task(current)) {
 		__get_cpu_var(ratelimits) = 0;
 		put_cpu_var(ratelimits);
 		balance_dirty_pages(mapping);
_
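To make the incremental-min arithmetic concrete, here is a minimal standalone
userspace sketch of the allocation loop above.  The zone names, watermarks and
free-page counts are invented for illustration; only the accumulation of min
down the zonelist and the real-time halving mirror the patch.

/*
 * Userspace sketch of the "incremental min" walk, with the real-time halving
 * applied per zone.  Not kernel code; all numbers are made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct zone_example {
	const char *name;
	unsigned long pages_low;	/* low watermark, in pages */
	unsigned long free_pages;	/* pages currently free */
};

int main(void)
{
	/* zonelist order for a hypothetical GFP_HIGHMEM request */
	struct zone_example zones[] = {
		{ "HighMem", 128, 150 },
		{ "Normal",  256, 300 },
		{ "DMA",      64,  80 },
	};
	unsigned int order = 0;
	unsigned long min = 1UL << order;
	bool rt = true;			/* pretend the caller is a real-time task */

	for (int i = 0; i < 3; i++) {
		unsigned long local_low = zones[i].pages_low;

		if (rt)
			local_low >>= 1;	/* RT tasks dip deeper into the reserves */
		min += local_low;		/* the min keeps growing down the zonelist */

		printf("%-7s: need %3lu free, have %3lu -> %s\n",
		       zones[i].name, min, zones[i].free_pages,
		       zones[i].free_pages >= min ? "allocate here" : "fall through");
	}
	return 0;
}

Running it with rt set to false instead shows each zone's threshold roughly
double, which is the whole point: a real-time allocation succeeds from a zone
that a normal allocation would already have skipped.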
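Similarly, a small userspace sketch of the writeback ratelimit decision with
the real-time bypass.  The static counter stands in for the kernel's per-CPU
accounting, and the throttle interval of 32 is an assumed value for the
example, not the kernel's computed one; only the "increment, compare, and
never throttle a real-time caller" shape mirrors the patch.

/*
 * Userspace sketch of the check added to balance_dirty_pages_ratelimited():
 * throttling is attempted only every `ratelimit` dirtyings, and never for a
 * real-time caller.  Not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int ratelimits;		/* stand-in for the per-CPU counter */

static void dirty_a_page(bool dirty_exceeded, bool is_rt_task)
{
	unsigned int ratelimit = 32;	/* assumed: throttle every 32 dirtied pages */

	if (dirty_exceeded)
		ratelimit = 8;		/* throttle harder once over the dirty limit */

	if (ratelimits++ >= ratelimit && !is_rt_task) {
		ratelimits = 0;
		printf("normal task: would enter balance_dirty_pages() here\n");
	}
}

int main(void)
{
	for (int i = 0; i < 64; i++)
		dirty_a_page(false, false);	/* normal task gets throttled */
	for (int i = 0; i < 64; i++)
		dirty_a_page(false, true);	/* real-time task never does */
	return 0;
}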