We've been futzing with the scan rates of the inactive and active lists
far too much, and it's still not right (Anton reports interrupt-off times
of over a second).

- We have this logic in there from 2.4.early (at least) which tries to
  keep the inactive list 1/3rd the size of the active list.  Or something.

  I really cannot see any logic behind this, so toss it out and change the
  arithmetic in there so that all pages on both lists have equal scan
  rates.

- Chunk the work up so we never hold interrupts off for more than 32
  pages' worth of scanning.

- Make the per-zone scan-count accumulators unsigned long rather than
  atomic_t.  Mainly because atomic_t's could conceivably overflow, but
  also because access to these counters is racy-by-design anyway.

Signed-off-by: Andrew Morton
---

 25-akpm/include/linux/mmzone.h |    4 +-
 25-akpm/mm/page_alloc.c        |    4 +-
 25-akpm/mm/vmscan.c            |   70 ++++++++++++++++++-----------------------
 3 files changed, 35 insertions(+), 43 deletions(-)

diff -puN mm/vmscan.c~vmscan-scan-sanity mm/vmscan.c
--- 25/mm/vmscan.c~vmscan-scan-sanity	2004-06-15 02:19:01.485627112 -0700
+++ 25-akpm/mm/vmscan.c	2004-06-15 02:49:29.317754392 -0700
@@ -789,54 +789,46 @@ refill_inactive_zone(struct zone *zone,
 }
 
 /*
- * Scan `nr_pages' from this zone.  Returns the number of reclaimed pages.
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static void
 shrink_zone(struct zone *zone, struct scan_control *sc)
 {
-	unsigned long scan_active, scan_inactive;
-	int count;
-
-	scan_inactive = (zone->nr_active + zone->nr_inactive) >> sc->priority;
+	unsigned long nr_active;
+	unsigned long nr_inactive;
 
 	/*
-	 * Try to keep the active list 2/3 of the size of the cache.  And
-	 * make sure that refill_inactive is given a decent number of pages.
-	 *
-	 * The "scan_active + 1" here is important.  With pagecache-intensive
-	 * workloads the inactive list is huge, and `ratio' evaluates to zero
-	 * all the time.  Which pins the active list memory.  So we add one to
-	 * `scan_active' just to make sure that the kernel will slowly sift
-	 * through the active list.
+	 * Add one to `nr_to_scan' just to make sure that the kernel will
+	 * slowly sift through the active list.
 	 */
-	if (zone->nr_active >= 4*(zone->nr_inactive*2 + 1)) {
-		/* Don't scan more than 4 times the inactive list scan size */
-		scan_active = 4*scan_inactive;
-	} else {
-		unsigned long long tmp;
-
-		/* Cast to long long so the multiply doesn't overflow */
-
-		tmp = (unsigned long long)scan_inactive * zone->nr_active;
-		do_div(tmp, zone->nr_inactive*2 + 1);
-		scan_active = (unsigned long)tmp;
-	}
-
-	atomic_add(scan_active + 1, &zone->nr_scan_active);
-	count = atomic_read(&zone->nr_scan_active);
-	if (count >= SWAP_CLUSTER_MAX) {
-		atomic_set(&zone->nr_scan_active, 0);
-		sc->nr_to_scan = count;
-		refill_inactive_zone(zone, sc);
-	}
+	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+	nr_active = zone->nr_scan_active;
+	if (nr_active >= SWAP_CLUSTER_MAX)
+		zone->nr_scan_active = 0;
+	else
+		nr_active = 0;
+
+	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+	nr_inactive = zone->nr_scan_inactive;
+	if (nr_inactive >= SWAP_CLUSTER_MAX)
+		zone->nr_scan_inactive = 0;
+	else
+		nr_inactive = 0;
+
+	while (nr_active || nr_inactive) {
+		if (nr_active) {
+			sc->nr_to_scan = min(nr_active,
+					(unsigned long)SWAP_CLUSTER_MAX);
+			nr_active -= sc->nr_to_scan;
+			refill_inactive_zone(zone, sc);
+		}
 
-	atomic_add(scan_inactive, &zone->nr_scan_inactive);
-	count = atomic_read(&zone->nr_scan_inactive);
-	if (count >= SWAP_CLUSTER_MAX) {
-		atomic_set(&zone->nr_scan_inactive, 0);
-		sc->nr_to_scan = count;
-		shrink_cache(zone, sc);
+		if (nr_inactive) {
+			sc->nr_to_scan = min(nr_inactive,
+					(unsigned long)SWAP_CLUSTER_MAX);
+			nr_inactive -= sc->nr_to_scan;
+			shrink_cache(zone, sc);
+		}
 	}
 }
 
diff -puN include/linux/mmzone.h~vmscan-scan-sanity include/linux/mmzone.h
--- 25/include/linux/mmzone.h~vmscan-scan-sanity	2004-06-15 02:49:35.705783264 -0700
+++ 25-akpm/include/linux/mmzone.h	2004-06-15 02:49:48.283871104 -0700
@@ -118,8 +118,8 @@ struct zone {
 	spinlock_t		lru_lock;
 	struct list_head	active_list;
 	struct list_head	inactive_list;
-	atomic_t		nr_scan_active;
-	atomic_t		nr_scan_inactive;
+	unsigned long		nr_scan_active;
+	unsigned long		nr_scan_inactive;
 	unsigned long		nr_active;
 	unsigned long		nr_inactive;
 	int			all_unreclaimable; /* All pages pinned */
diff -puN mm/page_alloc.c~vmscan-scan-sanity mm/page_alloc.c
--- 25/mm/page_alloc.c~vmscan-scan-sanity	2004-06-15 02:50:04.404420408 -0700
+++ 25-akpm/mm/page_alloc.c	2004-06-15 02:50:53.752918296 -0700
@@ -1482,8 +1482,8 @@ static void __init free_area_init_core(s
 				zone_names[j], realsize, batch);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
-		atomic_set(&zone->nr_scan_active, 0);
-		atomic_set(&zone->nr_scan_inactive, 0);
+		zone->nr_scan_active = 0;
+		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
 		zone->nr_inactive = 0;
 		if (!size)
_
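
For anyone who wants to play with the arithmetic outside the kernel, here is a
minimal userspace sketch of the accumulate-then-chunk logic in shrink_zone()
above.  It is not kernel code: toy_zone, toy_shrink_zone and scan_batch are
made-up names, the zone sizes and priorities are example numbers, scan_batch()
stands in for refill_inactive_zone()/shrink_cache(), and SWAP_CLUSTER_MAX is
taken to be 32 to match the "32 pages' worth of scanning" figure in the
changelog.

/*
 * Userspace sketch (not kernel code) of the accumulate-then-chunk logic
 * in shrink_zone() above.  toy_zone, toy_shrink_zone and scan_batch are
 * made-up names; the zone sizes and priorities in main() are example
 * numbers only.
 */
#include <stdio.h>

#define SWAP_CLUSTER_MAX	32UL	/* 32 pages per chunk, per the changelog */

struct toy_zone {
	unsigned long nr_active;	/* pages on the active list */
	unsigned long nr_inactive;	/* pages on the inactive list */
	unsigned long nr_scan_active;	/* accumulated, not-yet-performed work */
	unsigned long nr_scan_inactive;
};

/* Stand-in for refill_inactive_zone()/shrink_cache(): just report the batch */
static void scan_batch(const char *list, unsigned long nr_to_scan)
{
	printf("  scan %2lu pages from the %s list\n", nr_to_scan, list);
}

static void toy_shrink_zone(struct toy_zone *zone, int priority)
{
	unsigned long nr_active, nr_inactive;

	/*
	 * Each list accumulates list_size >> priority, so both lists get
	 * equal scan rates.  The "+ 1" guarantees slow forward progress even
	 * when the shift evaluates to zero.  Nothing is scanned until at
	 * least SWAP_CLUSTER_MAX pages' worth of work has built up.
	 */
	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
	nr_active = zone->nr_scan_active;
	if (nr_active >= SWAP_CLUSTER_MAX)
		zone->nr_scan_active = 0;
	else
		nr_active = 0;

	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
	nr_inactive = zone->nr_scan_inactive;
	if (nr_inactive >= SWAP_CLUSTER_MAX)
		zone->nr_scan_inactive = 0;
	else
		nr_inactive = 0;

	/*
	 * Drain the work in SWAP_CLUSTER_MAX-sized chunks.  In the kernel
	 * this is what bounds the interrupt-off time to 32 pages per batch.
	 */
	while (nr_active || nr_inactive) {
		if (nr_active) {
			unsigned long batch = nr_active < SWAP_CLUSTER_MAX ?
					nr_active : SWAP_CLUSTER_MAX;
			nr_active -= batch;
			scan_batch("active", batch);
		}
		if (nr_inactive) {
			unsigned long batch = nr_inactive < SWAP_CLUSTER_MAX ?
					nr_inactive : SWAP_CLUSTER_MAX;
			nr_inactive -= batch;
			scan_batch("inactive", batch);
		}
	}
}

int main(void)
{
	struct toy_zone zone = { .nr_active = 1000, .nr_inactive = 1000 };
	int priority;

	/* Mimic the reclaim priority loop: scan rate doubles as priority drops */
	for (priority = 6; priority >= 4; priority--) {
		printf("priority %d:\n", priority);
		toy_shrink_zone(&zone, priority);
	}
	return 0;
}

At priority 6 the per-call increment (1000 >> 6, plus one) stays below the
32-page threshold, so the work is only banked; at priorities 5 and 4 the
accumulated debt is paid off in batches of at most 32 pages, which is exactly
the behaviour the patch relies on to cap interrupt-off times.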