From: "Martin J. Bligh" This introduces calculate_e820_holes(), currently for x86_64 only, which will find holes from the e820 map in the memory range for both node->node_spanned_pages and zone->present_pages. This fixes a problem where we were significantly overestimating the amount of memory we had available on the system (eg 6GB instead of 4GB), which caused us to fill up too much memory with dirty pages. There are some extra debug printks in there that could be removed later, but we want them in for now, in case anyone has problems with the patch. Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/e820.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 21 +++++++++++++++++- 2 files changed, 70 insertions(+), 2 deletions(-) diff -puN arch/x86_64/kernel/e820.c~x86_64-fix-numa-node-sizing-in-nr_free_zone_pages arch/x86_64/kernel/e820.c --- devel/arch/x86_64/kernel/e820.c~x86_64-fix-numa-node-sizing-in-nr_free_zone_pages 2005-08-21 22:22:45.000000000 -0700 +++ devel-akpm/arch/x86_64/kernel/e820.c 2005-08-21 22:22:45.000000000 -0700 @@ -184,6 +184,57 @@ unsigned long __init e820_end_of_ram(voi return end_pfn; } +/* + * Calculate holes in the e820 map that intersect with a given memory area + */ +unsigned long calculate_e820_holes(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + pg_data_t *pgdat; + unsigned long e820_start_pfn, e820_end_pfn; + unsigned long holes_sum = 0; + + printk("Calculating e820 holes from pfn %ld to pfn %ld\n", + start_pfn, end_pfn); + /* + * Walk through the list of e820 blocks, identifying holes inbetween. + * We'll move start_pfn up as we go to indicate where we're up to. + * Assumes the e820 regions are in order, and not overlapping, which + * should have been fixed by now. + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + e820_start_pfn = (ei->addr) / PAGE_SIZE; + e820_end_pfn = (ei->addr + ei->size) / PAGE_SIZE; + if (e820_start_pfn >= e820_end_pfn) + continue; + if (e820_end_pfn < start_pfn) + continue; /* wholly before our scan region */ + if (e820_start_pfn > end_pfn) + break; /* wholly after our scan region */ + if (start_pfn < e820_start_pfn) { + /* gap before start of this block */ + holes_sum += e820_start_pfn - start_pfn; + printk("Found e820 hole (%lu pages) from pfn %lu to pfn %lu\n", + e820_start_pfn - start_pfn, + start_pfn, e820_start_pfn); + } + if (ei->type != E820_RAM) { + /* Effectively a hole, respective of usable memory */ + if (e820_start_pfn < start_pfn) /* partial overlap */ + e820_start_pfn = start_pfn; + if (e820_end_pfn > end_pfn) /* partial overlap */ + e820_end_pfn = end_pfn; + holes_sum += e820_end_pfn - e820_start_pfn; + printk("Found e820 unusable area (%lu pages) from pfn %lu to pfn %lu\n", e820_end_pfn - e820_start_pfn, e820_start_pfn, e820_end_pfn); + } + start_pfn = e820_end_pfn; + } + printk("Found a total of %lu pages of holes\n", holes_sum); + return holes_sum; +} + /* * Mark e820 reserved areas as busy for the resource manager. 
 */
diff -puN mm/page_alloc.c~x86_64-fix-numa-node-sizing-in-nr_free_zone_pages mm/page_alloc.c
--- devel/mm/page_alloc.c~x86_64-fix-numa-node-sizing-in-nr_free_zone_pages	2005-08-21 22:22:45.000000000 -0700
+++ devel-akpm/mm/page_alloc.c	2005-08-21 22:22:45.000000000 -0700
@@ -51,6 +51,12 @@ unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 long nr_swap_pages;
 
+#ifdef CONFIG_X86_64
+extern unsigned long calculate_e820_holes(unsigned long, unsigned long);
+#else
+#define calculate_e820_holes(start, end) (0)
+#endif
+
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
  *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -1630,6 +1636,7 @@ static inline unsigned long wait_table_b
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
+/* This function calculates the total pages in a node, not a zone. Pah. */
 static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
@@ -1641,11 +1648,18 @@ static void __init calculate_zone_totalp
 	pgdat->node_spanned_pages = totalpages;
 
 	realtotalpages = totalpages;
+	/*
+	 * Finding memory holes from BOTH e820 and zholes is a bad
+	 * idea since they may well overlap (eg summit, from SRAT)
+	 */
 	if (zholes_size)
 		for (i = 0; i < MAX_NR_ZONES; i++)
 			realtotalpages -= zholes_size[i];
+	else
+		realtotalpages -= calculate_e820_holes(pgdat->node_start_pfn,
+			pgdat->node_start_pfn + pgdat->node_spanned_pages);
 	pgdat->node_present_pages = realtotalpages;
-	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+	printk("On node %d, present: %lu, spanned: %lu\n", pgdat->node_id, realtotalpages, totalpages);
 }
 
 
@@ -1890,6 +1904,9 @@ static void __init free_area_init_core(s
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
+		else
+			realsize -= calculate_e820_holes(zone_start_pfn,
+				zone_start_pfn + size);
 
 		if (j == ZONE_DMA || j == ZONE_NORMAL)
 			nr_kernel_pages += realsize;
@@ -1916,7 +1933,7 @@ static void __init free_area_init_core(s
 			setup_pageset(zone_pcp(zone,cpu), batch);
 #endif
 		}
-		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+		printk("  %s zone: %lu pages, LIFO batch:%lu\n",
 			zone_names[j], realsize, batch);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
_
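
As an illustrative aside (not part of the patch): the sketch below redoes the
same hole-accounting walk in user space against a made-up region map, to show
how the present-page count ends up lower than the spanned-page count.  The
struct region type, the RAM/RESERVED constants and the sample pfn values are
invented for the example; they are not the kernel's e820 structures.

#include <stdio.h>

#define RAM		1
#define RESERVED	2

struct region {
	unsigned long start_pfn;
	unsigned long end_pfn;
	int type;
};

/*
 * Same idea as calculate_e820_holes(): walk a sorted, non-overlapping
 * map and sum the gaps between regions plus any non-RAM regions that
 * intersect [start_pfn, end_pfn).
 */
static unsigned long count_holes(const struct region *map, int nr,
				 unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long holes = 0;
	int i;

	for (i = 0; i < nr; i++) {
		unsigned long r_start = map[i].start_pfn;
		unsigned long r_end = map[i].end_pfn;

		if (r_end < start_pfn)
			continue;		/* wholly before our range */
		if (r_start > end_pfn)
			break;			/* wholly after our range */
		if (start_pfn < r_start)	/* gap before this region */
			holes += r_start - start_pfn;
		if (map[i].type != RAM) {	/* clamp, then count non-RAM */
			if (r_start < start_pfn)
				r_start = start_pfn;
			if (r_end > end_pfn)
				r_end = end_pfn;
			holes += r_end - r_start;
		}
		start_pfn = r_end;		/* scanned up to here */
	}
	return holes;
}

int main(void)
{
	/*
	 * 4K pages: RAM 0-1GB, hole to 1.5GB, reserved 1.5-2GB,
	 * hole to 4GB, RAM 4-6GB.
	 */
	struct region map[] = {
		{ 0x00000,  0x40000,  RAM },
		{ 0x60000,  0x80000,  RESERVED },
		{ 0x100000, 0x180000, RAM },
	};
	unsigned long spanned = 0x180000;	/* range spans pfn 0 to 6GB */
	unsigned long holes = count_holes(map, 3, 0, spanned);

	/* prints: spanned 1572864 pages, present 786432 pages */
	printf("spanned %lu pages, present %lu pages\n",
	       spanned, spanned - holes);
	return 0;
}

Any C compiler will build it standalone; the point is just that the spanned
count stays at the full pfn range while the present count drops by the holes,
which is the overestimate the changelog describes.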