The zone->pressure field is supposed to record the amount of reclaim pressure which this zone is under. We need this info so we know whether to unmap pages from pagetables right from the outset of a balance_pgdat() or try_to_free_pages() invocation. The problem with the current code is that the exponential average gets tugged around too much: as we perform the increasing-priority scan, the pressure metric is made artificially low by the early part of the scan. So instead what we do here is to record within the zone the scanning priority from the zone's previous scan. It is defined as the priority at which the zone achieved the "enough pages free" state. This prev_priority is used on the next scan for the do-we-need-to-be-unmapping-pages decision. include/linux/mmzone.h | 23 +++++++++++++------- mm/vmscan.c | 56 +++++++++++++++++++------------------------------ 2 files changed, 37 insertions(+), 42 deletions(-) diff -puN mm/vmscan.c~zone-pressure-simplification mm/vmscan.c --- 25/mm/vmscan.c~zone-pressure-simplification 2003-08-28 23:03:41.000000000 -0700 +++ 25-akpm/mm/vmscan.c 2003-08-28 23:03:41.000000000 -0700 @@ -80,25 +80,6 @@ static long total_memory; #endif /* - * exponentially decaying average - */ -static inline int expavg(int avg, int val) -{ - return ((val - avg) >> 1) + avg; -} - -static void zone_adj_pressure(struct zone *zone, int priority) -{ - zone->pressure = expavg(zone->pressure, - (DEF_PRIORITY - priority) << 10); -} - -static int pressure_to_priority(int pressure) -{ - return DEF_PRIORITY - (pressure >> 10); -} - -/* * The list of shrinker callbacks used by to apply pressure to * ageable caches. */ @@ -646,7 +627,7 @@ refill_inactive_zone(struct zone *zone, * `distress' is a measure of how much trouble we're having reclaiming * pages. 0 -> no problems. 100 -> great trouble. 
*/ - distress = 100 >> pressure_to_priority(zone->pressure); + distress = 100 >> zone->prev_priority; /* * The point of this algorithm is to decide when to start reclaiming @@ -830,6 +811,9 @@ shrink_caches(struct zone *classzone, in int nr_mapped = 0; int max_scan; + if (zone->free_pages < zone->pages_high) + zone->temp_priority = priority; + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ @@ -843,10 +827,8 @@ shrink_caches(struct zone *classzone, in ret += shrink_zone(zone, max_scan, gfp_mask, to_reclaim, &nr_mapped, ps, priority); *total_scanned += max_scan + nr_mapped; - if (ret >= nr_pages) { - zone_adj_pressure(zone, priority); + if (ret >= nr_pages) break; - } } return ret; } @@ -880,6 +862,9 @@ int try_to_free_pages(struct zone *cz, inc_page_state(allocstall); + for (zone = cz; zone >= cz->zone_pgdat->node_zones; --zone) + zone->temp_priority = DEF_PRIORITY; + for (priority = DEF_PRIORITY; priority >= 0; priority--) { int total_scanned = 0; struct page_state ps; @@ -912,9 +897,9 @@ int try_to_free_pages(struct zone *cz, } if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) out_of_memory(); - for (zone = cz; zone >= cz->zone_pgdat->node_zones; -- zone) - zone_adj_pressure(zone, -1); out: + for (zone = cz; zone >= cz->zone_pgdat->node_zones; --zone) + zone->prev_priority = zone->temp_priority; return ret; } @@ -945,6 +930,12 @@ static int balance_pgdat(pg_data_t *pgda inc_page_state(pageoutrun); + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->temp_priority = DEF_PRIORITY; + } + for (priority = DEF_PRIORITY; priority; priority--) { int all_zones_ok = 1; @@ -961,11 +952,10 @@ static int balance_pgdat(pg_data_t *pgda to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8); } else { /* Zone balancing */ to_reclaim = zone->pages_high-zone->free_pages; - if (to_reclaim <= 0) { - zone_adj_pressure(zone, priority); + if (to_reclaim <= 0) continue; - } } + zone->temp_priority = 
priority; all_zones_ok = 0; max_scan = zone->nr_inactive >> priority; if (max_scan < to_reclaim * 2) @@ -989,13 +979,11 @@ static int balance_pgdat(pg_data_t *pgda if (to_free > 0) blk_congestion_wait(WRITE, HZ/10); } - if (priority <= 0) { - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - if (zone->free_pages < zone->pages_high) - zone_adj_pressure(zone, -1); - } + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->prev_priority = zone->temp_priority; } return nr_pages - to_free; } diff -puN include/linux/mmzone.h~zone-pressure-simplification include/linux/mmzone.h --- 25/include/linux/mmzone.h~zone-pressure-simplification 2003-08-28 23:03:41.000000000 -0700 +++ 25-akpm/include/linux/mmzone.h 2003-08-28 23:03:41.000000000 -0700 @@ -89,17 +89,24 @@ struct zone { ZONE_PADDING(_pad2_) - /* - * measure of scanning intensity for this zone. It is calculated - * as exponentially decaying average of the scanning priority - * required to free enough pages in this zone - * (zone_adj_pressure()). + /* + * prev_priority holds the scanning priority for this zone. It is + * defined as the scanning priority at which we achieved our reclaim + * target at the previous try_to_free_pages() or balance_pgdat() + * invokation. + * + * We use prev_priority as a measure of how much stress page reclaim is + * under - it drives the swappiness decision: whether to unmap mapped + * pages. * - * 0 --- low pressure + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. * - * (DEF_PRIORITY << 10) --- high pressure + * Access to both these fields is quite racy even on uniprocessor. But + * it is expected to average out OK. */ - int pressure; + int temp_priority; + int prev_priority; /* * free areas of different sizes _