diff options
author | Greg Kroah-Hartman <gregkh@suse.de> | 2011-09-11 12:27:50 +0200 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@suse.de> | 2011-09-11 12:27:50 +0200 |
commit | 2cda044ac988bb928bd4dd5b105cfd1837916cbd (patch) | |
tree | 2a40cb11e267e438f81cb14a2b3cf99df8f630ab | |
parent | 16f25c3855d04e30346983c484e1145414969f86 (diff) | |
download | stable-queue-2cda044ac988bb928bd4dd5b105cfd1837916cbd.tar.gz |
3.0 patches
3 files changed, 333 insertions, 0 deletions
diff --git a/queue-3.0/mm-page-allocator-initialise-zlc-for-first-zone-eligible-for-zone_reclaim.patch b/queue-3.0/mm-page-allocator-initialise-zlc-for-first-zone-eligible-for-zone_reclaim.patch new file mode 100644 index 0000000000..6cc0774478 --- /dev/null +++ b/queue-3.0/mm-page-allocator-initialise-zlc-for-first-zone-eligible-for-zone_reclaim.patch @@ -0,0 +1,257 @@ +From cd38b115d5ad79b0100ac6daa103c4fe2c50a913 Mon Sep 17 00:00:00 2001 +From: Mel Gorman <mgorman@suse.de> +Date: Mon, 25 Jul 2011 17:12:29 -0700 +Subject: mm: page allocator: initialise ZLC for first zone eligible for zone_reclaim + +From: Mel Gorman <mgorman@suse.de> + +commit cd38b115d5ad79b0100ac6daa103c4fe2c50a913 upstream. + +There have been a small number of complaints about significant stalls +while copying large amounts of data on NUMA machines reported on a +distribution bugzilla. In these cases, zone_reclaim was enabled by +default due to large NUMA distances. In general, the complaints have not +been about the workload itself unless it was a file server (in which case +the recommendation was to disable zone_reclaim). + +The stalls are mostly due to significant amounts of time spent scanning +the preferred zone for pages to free. After a failure, it might fall back +to another node (as zonelists are often node-ordered rather than +zone-ordered) but stall quickly again when the next allocation attempt +occurs. In bad cases, each page allocated results in a full scan of the +preferred zone. + +Patch 1 checks the preferred zone for recent allocation failure + which is particularly important if zone_reclaim has failed + recently. This avoids rescanning the zone in the near future and + instead falling back to another node. This may hurt node locality + in some cases but a failure to zone_reclaim is more expensive than + a remote access. + +Patch 2 clears the zlc information after direct reclaim. 
+ Otherwise, zone_reclaim can mark zones full, direct reclaim can + reclaim enough pages but the zone is still not considered for + allocation. + +This was tested on a 24-thread 2-node x86_64 machine. The tests were +focused on large amounts of IO. All tests were bound to the CPUs on +node-0 to avoid disturbances due to processes being scheduled on different +nodes. The kernels tested are + +3.0-rc6-vanilla Vanilla 3.0-rc6 +zlcfirst Patch 1 applied +zlcreconsider Patches 1+2 applied + +FS-Mark +./fs_mark -d /tmp/fsmark-10813 -D 100 -N 5000 -n 208 -L 35 -t 24 -S0 -s 524288 + fsmark-3.0-rc6 3.0-rc6 3.0-rc6 + vanilla zlcfirst zlcreconsider +Files/s min 54.90 ( 0.00%) 49.80 (-10.24%) 49.10 (-11.81%) +Files/s mean 100.11 ( 0.00%) 135.17 (25.94%) 146.93 (31.87%) +Files/s stddev 57.51 ( 0.00%) 138.97 (58.62%) 158.69 (63.76%) +Files/s max 361.10 ( 0.00%) 834.40 (56.72%) 802.40 (55.00%) +Overhead min 76704.00 ( 0.00%) 76501.00 ( 0.27%) 77784.00 (-1.39%) +Overhead mean 1485356.51 ( 0.00%) 1035797.83 (43.40%) 1594680.26 (-6.86%) +Overhead stddev 1848122.53 ( 0.00%) 881489.88 (109.66%) 1772354.90 ( 4.27%) +Overhead max 7989060.00 ( 0.00%) 3369118.00 (137.13%) 10135324.00 (-21.18%) +MMTests Statistics: duration +User/Sys Time Running Test (seconds) 501.49 493.91 499.93 +Total Elapsed Time (seconds) 2451.57 2257.48 2215.92 + +MMTests Statistics: vmstat +Page Ins 46268 63840 66008 +Page Outs 90821596 90671128 88043732 +Swap Ins 0 0 0 +Swap Outs 0 0 0 +Direct pages scanned 13091697 8966863 8971790 +Kswapd pages scanned 0 1830011 1831116 +Kswapd pages reclaimed 0 1829068 1829930 +Direct pages reclaimed 13037777 8956828 8648314 +Kswapd efficiency 100% 99% 99% +Kswapd velocity 0.000 810.643 826.346 +Direct efficiency 99% 99% 96% +Direct velocity 5340.128 3972.068 4048.788 +Percentage direct scans 100% 83% 83% +Page writes by reclaim 0 3 0 +Slabs scanned 796672 720640 720256 +Direct inode steals 7422667 7160012 7088638 +Kswapd inode steals 0 1736840 2021238 + +Test completes far faster 
with a large increase in the number of files +created per second. Standard deviation is high as a small number of +iterations were much higher than the mean. The number of pages scanned by +zone_reclaim is reduced and kswapd is used for more work. + +LARGE DD + 3.0-rc6 3.0-rc6 3.0-rc6 + vanilla zlcfirst zlcreconsider +download tar 59 ( 0.00%) 59 ( 0.00%) 55 ( 7.27%) +dd source files 527 ( 0.00%) 296 (78.04%) 320 (64.69%) +delete source 36 ( 0.00%) 19 (89.47%) 20 (80.00%) +MMTests Statistics: duration +User/Sys Time Running Test (seconds) 125.03 118.98 122.01 +Total Elapsed Time (seconds) 624.56 375.02 398.06 + +MMTests Statistics: vmstat +Page Ins 3594216 439368 407032 +Page Outs 23380832 23380488 23377444 +Swap Ins 0 0 0 +Swap Outs 0 436 287 +Direct pages scanned 17482342 69315973 82864918 +Kswapd pages scanned 0 519123 575425 +Kswapd pages reclaimed 0 466501 522487 +Direct pages reclaimed 5858054 2732949 2712547 +Kswapd efficiency 100% 89% 90% +Kswapd velocity 0.000 1384.254 1445.574 +Direct efficiency 33% 3% 3% +Direct velocity 27991.453 184832.737 208171.929 +Percentage direct scans 100% 99% 99% +Page writes by reclaim 0 5082 13917 +Slabs scanned 17280 29952 35328 +Direct inode steals 115257 1431122 332201 +Kswapd inode steals 0 0 979532 + +This test downloads a large tarfile and copies it with dd a number of +times - similar to the most recent bug report I've dealt with. Time to +completion is reduced. The number of pages scanned directly is still +disturbingly high with a low efficiency but this is likely due to the +number of dirty pages encountered. The figures could probably be improved +with more work around how kswapd is used and how dirty pages are handled +but that is separate work and this result is significant on its own. 
+ +Streaming Mapped Writer +MMTests Statistics: duration +User/Sys Time Running Test (seconds) 124.47 111.67 112.64 +Total Elapsed Time (seconds) 2138.14 1816.30 1867.56 + +MMTests Statistics: vmstat +Page Ins 90760 89124 89516 +Page Outs 121028340 120199524 120736696 +Swap Ins 0 86 55 +Swap Outs 0 0 0 +Direct pages scanned 114989363 96461439 96330619 +Kswapd pages scanned 56430948 56965763 57075875 +Kswapd pages reclaimed 27743219 27752044 27766606 +Direct pages reclaimed 49777 46884 36655 +Kswapd efficiency 49% 48% 48% +Kswapd velocity 26392.541 31363.631 30561.736 +Direct efficiency 0% 0% 0% +Direct velocity 53780.091 53108.759 51581.004 +Percentage direct scans 67% 62% 62% +Page writes by reclaim 385 122 1513 +Slabs scanned 43008 39040 42112 +Direct inode steals 0 10 8 +Kswapd inode steals 733 534 477 + +This test just creates a large file mapping and writes to it linearly. +Time to completion is again reduced. + +The gains are mostly down to two things. In many cases, there is less +scanning as zone_reclaim simply gives up faster due to recent failures. +The second reason is that memory is used more efficiently. Instead of +scanning the preferred zone every time, the allocator falls back to +another zone and uses it instead, improving overall memory utilisation. + +This patch: initialise ZLC for first zone eligible for zone_reclaim. + +The zonelist cache (ZLC) is used among other things to record if +zone_reclaim() failed for a particular zone recently. The intention is to +avoid a high cost scanning extremely long zonelists or scanning within the +zone uselessly. + +Currently the zonelist cache is set up only after the first zone has been +considered and zone_reclaim() has been called. The objective was to avoid +a costly setup but zone_reclaim is itself quite expensive. If it is +failing regularly such as the first eligible zone having mostly mapped +pages, the cost in scanning and allocation stalls is far higher than the +ZLC initialisation step. 
+ +This patch initialises ZLC before the first eligible zone calls +zone_reclaim(). Once initialised, it is checked whether the zone failed +zone_reclaim recently. If it has, the zone is skipped. As the first zone +is now being checked, additional care has to be taken about zones marked +full. A zone can be marked "full" because it should not have enough +unmapped pages for zone_reclaim but this is excessive as direct reclaim or +kswapd may succeed where zone_reclaim fails. Only mark zones "full" after +zone_reclaim fails if it failed to reclaim enough pages after scanning. + +Signed-off-by: Mel Gorman <mgorman@suse.de> +Cc: Minchan Kim <minchan.kim@gmail.com> +Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> +Cc: Christoph Lameter <cl@linux.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Stefan Priebe <s.priebe@profihost.ag> +Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> + +--- + mm/page_alloc.c | 35 ++++++++++++++++++++++------------- + 1 file changed, 22 insertions(+), 13 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1664,7 +1664,7 @@ zonelist_scan: + continue; + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed_softwall(zone, gfp_mask)) +- goto try_next_zone; ++ continue; + + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { +@@ -1676,17 +1676,36 @@ zonelist_scan: + classzone_idx, alloc_flags)) + goto try_this_zone; + ++ if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { ++ /* ++ * we do zlc_setup if there are multiple nodes ++ * and before considering the first zone allowed ++ * by the cpuset. ++ */ ++ allowednodes = zlc_setup(zonelist, alloc_flags); ++ zlc_active = 1; ++ did_zlc_setup = 1; ++ } ++ + if (zone_reclaim_mode == 0) + goto this_zone_full; + ++ /* ++ * As we may have just activated ZLC, check if the first ++ * eligible zone has failed zone_reclaim recently. 
++ */ ++ if (NUMA_BUILD && zlc_active && ++ !zlc_zone_worth_trying(zonelist, z, allowednodes)) ++ continue; ++ + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ +- goto try_next_zone; ++ continue; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ +- goto this_zone_full; ++ continue; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, +@@ -1703,16 +1722,6 @@ try_this_zone: + this_zone_full: + if (NUMA_BUILD) + zlc_mark_zone_full(zonelist, z); +-try_next_zone: +- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { +- /* +- * we do zlc_setup after the first zone is tried but only +- * if there are multiple nodes make it worthwhile +- */ +- allowednodes = zlc_setup(zonelist, alloc_flags); +- zlc_active = 1; +- did_zlc_setup = 1; +- } + } + + if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { diff --git a/queue-3.0/mm-page-allocator-reconsider-zones-for-allocation-after-direct-reclaim.patch b/queue-3.0/mm-page-allocator-reconsider-zones-for-allocation-after-direct-reclaim.patch new file mode 100644 index 0000000000..f3569e3f7f --- /dev/null +++ b/queue-3.0/mm-page-allocator-reconsider-zones-for-allocation-after-direct-reclaim.patch @@ -0,0 +1,74 @@ +From 76d3fbf8fbf6cc78ceb63549e0e0c5bc8a88f838 Mon Sep 17 00:00:00 2001 +From: Mel Gorman <mgorman@suse.de> +Date: Mon, 25 Jul 2011 17:12:30 -0700 +Subject: mm: page allocator: reconsider zones for allocation after direct reclaim + +From: Mel Gorman <mgorman@suse.de> + +commit 76d3fbf8fbf6cc78ceb63549e0e0c5bc8a88f838 upstream. + +With zone_reclaim_mode enabled, it's possible for zones to be considered +full in the zonelist_cache so they are skipped in the future. If the +process enters direct reclaim, the ZLC may still consider zones to be full +even after reclaiming pages. Reconsider all zones for allocation if +direct reclaim returns successfully. 
+ +Signed-off-by: Mel Gorman <mgorman@suse.de> +Cc: Minchan Kim <minchan.kim@gmail.com> +Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> +Cc: Christoph Lameter <cl@linux.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Stefan Priebe <s.priebe@profihost.ag> +Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> + +--- + mm/page_alloc.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zo + set_bit(i, zlc->fullzones); + } + ++/* ++ * clear all zones full, called after direct reclaim makes progress so that ++ * a zone that was recently full is not skipped over for up to a second ++ */ ++static void zlc_clear_zones_full(struct zonelist *zonelist) ++{ ++ struct zonelist_cache *zlc; /* cached zonelist speedup info */ ++ ++ zlc = zonelist->zlcache_ptr; ++ if (!zlc) ++ return; ++ ++ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); ++} ++ + #else /* CONFIG_NUMA */ + + static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +@@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct + static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) + { + } ++ ++static void zlc_clear_zones_full(struct zonelist *zonelist) ++{ ++} + #endif /* CONFIG_NUMA */ + + /* +@@ -1963,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_m + if (unlikely(!(*did_some_progress))) + return NULL; + ++ /* After successful reclaim, reconsider all zones for allocation */ ++ if (NUMA_BUILD) ++ zlc_clear_zones_full(zonelist); ++ + retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, + zonelist, high_zoneidx, diff --git a/queue-3.0/series b/queue-3.0/series index f4377b8687..0230df07c7 100644 --- a/queue-3.0/series +++ b/queue-3.0/series @@ -50,3 +50,5 @@ arm-7014-1-cache-l2x0-fix-l2-cache-size-calculation.patch 
md-linear-avoid-corrupting-structure-while-waiting-for.patch drm-radeon-kms-set-a-default-max_pixel_clock.patch drm-radeon-kms-make-sure-pci-max-read-request-size-is-valid-on-evergreen-v2.patch +mm-page-allocator-initialise-zlc-for-first-zone-eligible-for-zone_reclaim.patch +mm-page-allocator-reconsider-zones-for-allocation-after-direct-reclaim.patch |