try_to_free_pages() currently fails to notice that it successfully freed
slab pages via shrink_slab().  So it can keep looping and eventually call
out_of_memory(), even though there's a lot of memory now free.  And even if
it doesn't do that, it can free too much memory.

The patch changes try_to_free_pages() so that it will notice freed slab
pages and will return when enough memory has been freed via shrink_slab().

Many options were considered, but most of them were unacceptably
inaccurate, intrusive or sleazy.  I ended up putting the accounting into a
stack-local structure which is pointed to by current->reclaim_state.

One reason for this is that we can cleanly resurrect the
current->local_pages pool by putting it into struct reclaim_state.
(current->local_pages was removed because the per-cpu page pools in the
page allocator largely duplicate its function.  But it is still possible
for interrupt-time allocations to steal just-freed pages, so we might want
to put it back some time.)
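
The usage pattern is then (this is only a sketch of what the
__alloc_pages() and kmem_freepages() hunks below actually do, using the
same identifiers as the patch): the reclaim caller publishes a stack-local
reclaim_state,

	struct reclaim_state reclaim_state;

	reclaim_state.reclaimed_slab = 0;	/* slab pages freed so far */
	current->reclaim_state = &reclaim_state;
	try_to_free_pages(classzone, gfp_mask, order);
	current->reclaim_state = NULL;

and on the freeing side, kmem_freepages() credits the pages it releases:

	if (current->reclaim_state)
		current->reclaim_state->reclaimed_slab += nr_freed;

Because the structure lives on the reclaiming task's stack and is only
reached via current, no locking is needed: each task accounts only the
slab pages which were freed beneath its own call chain.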

 25-akpm/include/linux/sched.h |    5 +++++
 25-akpm/include/linux/swap.h  |    8 ++++++++
 25-akpm/mm/page_alloc.c       |    6 ++++++
 25-akpm/mm/slab.c             |    6 +++++-
 25-akpm/mm/vmscan.c           |   37 +++++++++++++++++++++++++++----------
 5 files changed, 51 insertions(+), 11 deletions(-)

diff -puN include/linux/sched.h~shrink_slab-accounting include/linux/sched.h
--- 25/include/linux/sched.h~shrink_slab-accounting	Thu May  1 14:07:15 2003
+++ 25-akpm/include/linux/sched.h	Thu May  1 14:07:15 2003
@@ -294,6 +294,7 @@ extern struct user_struct root_user;
 
 typedef struct prio_array prio_array_t;
 struct backing_dev_info;
+struct reclaim_state;
 
 /* POSIX.1b interval timer structure. */
 struct k_itimer {
@@ -435,6 +436,10 @@ struct task_struct {
 
 /* journalling filesystem info */
 	void *journal_info;
+
+/* VM state */
+	struct reclaim_state *reclaim_state;
+
 	struct dentry *proc_dentry;
 	struct backing_dev_info *backing_dev_info;

diff -puN mm/vmscan.c~shrink_slab-accounting mm/vmscan.c
--- 25/mm/vmscan.c~shrink_slab-accounting	Thu May  1 14:07:15 2003
+++ 25-akpm/mm/vmscan.c	Thu May  1 14:10:23 2003
@@ -817,12 +817,14 @@ shrink_caches(struct zone *classzone, in
  * excessive rotation of the inactive list, which is _supposed_ to be an LRU,
  * yes?
  */
-int try_to_free_pages(struct zone *classzone,
+int try_to_free_pages(struct zone *cz,
 		unsigned int gfp_mask, unsigned int order)
 {
 	int priority;
+	int ret = 0;
 	const int nr_pages = SWAP_CLUSTER_MAX;
 	int nr_reclaimed = 0;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
 
 	inc_page_state(allocstall);
@@ -831,11 +833,12 @@ int try_to_free_pages(struct zone *class
 		struct page_state ps;
 
 		get_page_state(&ps);
-		nr_reclaimed += shrink_caches(classzone, priority,
-					&total_scanned, gfp_mask,
-					nr_pages, &ps);
-		if (nr_reclaimed >= nr_pages)
-			return 1;
+		nr_reclaimed += shrink_caches(cz, priority, &total_scanned,
+					gfp_mask, nr_pages, &ps);
+		if (nr_reclaimed >= nr_pages) {
+			ret = 1;
+			goto out;
+		}
 		if (!(gfp_mask & __GFP_FS))
 			break;		/* Let the caller handle it */
 		/*
@@ -847,12 +850,18 @@ int try_to_free_pages(struct zone *class
 
 		/* Take a nap, wait for some writeback to complete */
 		blk_congestion_wait(WRITE, HZ/10);
-		if (classzone - classzone->zone_pgdat->node_zones < ZONE_HIGHMEM)
+		if (cz - cz->zone_pgdat->node_zones < ZONE_HIGHMEM) {
 			shrink_slab(total_scanned, gfp_mask);
+			if (reclaim_state) {
+				nr_reclaimed += reclaim_state->reclaimed_slab;
+				reclaim_state->reclaimed_slab = 0;
+			}
+		}
 	}
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
 		out_of_memory();
-	return 0;
+out:
+	return ret;
 }
 
 /*
@@ -878,6 +887,7 @@ static int balance_pgdat(pg_data_t *pgda
 	int to_free = nr_pages;
 	int priority;
 	int i;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
 
 	inc_page_state(pageoutrun);
@@ -908,8 +918,11 @@ static int balance_pgdat(pg_data_t *pgda
 				max_scan = SWAP_CLUSTER_MAX;
 			to_free -= shrink_zone(zone, max_scan, GFP_KERNEL,
 					to_reclaim, &nr_mapped, ps, priority);
-			if (i < ZONE_HIGHMEM)
+			if (i < ZONE_HIGHMEM) {
+				reclaim_state->reclaimed_slab = 0;
 				shrink_slab(max_scan + nr_mapped, GFP_KERNEL);
+				to_free += reclaim_state->reclaimed_slab;
+			}
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned > zone->present_pages * 2)
@@ -940,10 +953,14 @@ int kswapd(void *p)
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 	DEFINE_WAIT(wait);
+	struct reclaim_state reclaim_state = {
+		.reclaimed_slab = 0,
+	};
 
 	daemonize("kswapd%d", pgdat->node_id);
 	set_cpus_allowed(tsk, node_to_cpumask(pgdat->node_id));
-
+	current->reclaim_state = &reclaim_state;
+
 	/*
 	 * Tell the memory management that we're a "memory allocator",
 	 * and that if we need more memory we should get access to it

diff -puN include/linux/swap.h~shrink_slab-accounting include/linux/swap.h
--- 25/include/linux/swap.h~shrink_slab-accounting	Thu May  1 14:07:15 2003
+++ 25-akpm/include/linux/swap.h	Thu May  1 14:07:15 2003
@@ -66,6 +66,14 @@ typedef struct {
 	unsigned long val;
 } swp_entry_t;
 
+/*
+ * current->reclaim_state points to one of these when a task is running
+ * memory reclaim
+ */
+struct reclaim_state {
+	unsigned long reclaimed_slab;
+};
+
 #ifdef __KERNEL__
 
 struct address_space;

diff -puN mm/slab.c~shrink_slab-accounting mm/slab.c
--- 25/mm/slab.c~shrink_slab-accounting	Thu May  1 14:07:15 2003
+++ 25-akpm/mm/slab.c	Thu May  1 14:07:15 2003
@@ -76,6 +76,7 @@
 #include
 #include
 #include
+#include	<linux/swap.h>
 #include
 #include
 #include
@@ -714,6 +715,7 @@ static inline void kmem_freepages (kmem_
 {
 	unsigned long i = (1<<cachep->gfporder);
 	struct page *page = virt_to_page(addr);
+	const unsigned long nr_freed = i;
 
 	/* free_pages() does not clear the type bit - we do that.
 	 * The pages have been unlinked from their cache-slab,
@@ -722,9 +724,11 @@ static inline void kmem_freepages (kmem_
 	 */
 	while (i--) {
 		ClearPageSlab(page);
-		dec_page_state(nr_slab);
 		page++;
 	}
+	sub_page_state(nr_slab, nr_freed);
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += nr_freed;
 	free_pages((unsigned long)addr, cachep->gfporder);
 }

diff -puN mm/page_alloc.c~shrink_slab-accounting mm/page_alloc.c
--- 25/mm/page_alloc.c~shrink_slab-accounting	Thu May  1 14:07:15 2003
+++ 25-akpm/mm/page_alloc.c	Thu May  1 14:07:15 2003
@@ -537,6 +537,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 	int i;
 	int cold;
 	int do_retry;
+	struct reclaim_state reclaim_state;
 
 	if (wait)
 		might_sleep();
@@ -613,7 +614,12 @@ rebalance:
 		goto nopage;
 
 	current->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	current->reclaim_state = &reclaim_state;
+
 	try_to_free_pages(classzone, gfp_mask, order);
+
+	current->reclaim_state = NULL;
 	current->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
_