From: Andi Kleen This is an experimental patch that needs a lot more benchmarking before it can be put into mainline. The main need is benchmarking to check that this patch causes no performance regressions on x86-64 machines with >4GB of memory. I wouldn't put it into mainline anytime soon; it's really only for testing. Add 4GB DMA32 zone Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones. This mainly helps graphics drivers that really need a lot of memory in the area below 4GB. Previously they could only use IOMMU+16MB GFP_DMA, which was not enough memory. This is done for x86-64. For other architectures that don't set up this new zone, nothing changes. It may make sense to rename IA64's GFP_DMA to GFP_DMA32 for better source code compatibility though. I didn't do this; so far only x86-64 is affected. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton --- 25-akpm/arch/x86_64/mm/init.c | 48 +++++++++++++++++++++++++------------ 25-akpm/arch/x86_64/mm/numa.c | 11 -------- 25-akpm/include/asm-x86_64/dma.h | 11 ++++++-- 25-akpm/include/asm-x86_64/proto.h | 2 + 25-akpm/include/linux/gfp.h | 5 +++ 25-akpm/include/linux/mmzone.h | 14 ++++++---- 25-akpm/mm/page_alloc.c | 16 +++++++++--- 7 files changed, 69 insertions(+), 38 deletions(-) diff -puN arch/x86_64/mm/init.c~x86_64-experimental-4gb-dma-zone arch/x86_64/mm/init.c --- 25/arch/x86_64/mm/init.c~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.607420952 -0800 +++ 25-akpm/arch/x86_64/mm/init.c 2004-11-28 01:56:45.620418976 -0800 @@ -325,24 +325,42 @@ void zap_low_mappings(void) flush_tlb_all(); } +/* Compute zone sizes for the DMA and DMA32 zones in a node.
*/ +__init void +size_zones(unsigned long *z, unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + unsigned long missing; + + for (i = 0; i < MAX_NR_ZONES; i++) + z[i] = 0; + + if (start_pfn < MAX_DMA_PFN) + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; + if (start_pfn < MAX_DMA32_PFN) { + unsigned long dma32_pfn = MAX_DMA32_PFN; + if (dma32_pfn > end_pfn) + dma32_pfn = end_pfn; + z[ZONE_DMA32] = dma32_pfn - start_pfn; + } + z[ZONE_NORMAL] = end_pfn - start_pfn; + + /* Remove lower zones from higher ones. */ + missing = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + if (z[i]) + z[i] -= missing; + printk("--- zone %d: %lx pages\n", i, z[i]); + missing += z[i]; + } +} + #ifndef CONFIG_DISCONTIGMEM void __init paging_init(void) { - { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - if (end_pfn < max_dma) - zones_size[ZONE_DMA] = end_pfn; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = end_pfn - max_dma; - } - free_area_init(zones_size); - } - return; + unsigned long zones_size[MAX_NR_ZONES]; + size_zones(zones_size, 0, end_pfn); + free_area_init(zones_size); } #endif diff -puN arch/x86_64/mm/numa.c~x86_64-experimental-4gb-dma-zone arch/x86_64/mm/numa.c --- 25/arch/x86_64/mm/numa.c~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.608420800 -0800 +++ 25-akpm/arch/x86_64/mm/numa.c 2004-11-28 01:56:45.620418976 -0800 @@ -121,7 +121,6 @@ void __init setup_node_zones(int nodeid) { unsigned long start_pfn, end_pfn; unsigned long zones[MAX_NR_ZONES]; - unsigned long dma_end_pfn; memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); @@ -130,15 +129,7 @@ void __init setup_node_zones(int nodeid) Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); - /* All nodes > 0 have a zero length zone DMA */ - dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; - if (start_pfn < dma_end_pfn) { - zones[ZONE_DMA] = dma_end_pfn - start_pfn; - 
zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; - } else { - zones[ZONE_NORMAL] = end_pfn - start_pfn; - } - + size_zones(zones, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, start_pfn, NULL); } diff -puN include/asm-x86_64/dma.h~x86_64-experimental-4gb-dma-zone include/asm-x86_64/dma.h --- 25/include/asm-x86_64/dma.h~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.609420648 -0800 +++ 25-akpm/include/asm-x86_64/dma.h 2004-11-28 01:56:45.621418824 -0800 @@ -72,8 +72,15 @@ #define MAX_DMA_CHANNELS 8 -/* The maximum address that we can perform a DMA transfer to on this platform */ -#define MAX_DMA_ADDRESS (PAGE_OFFSET+0x1000000) + +/* 16MB ISA DMA zone */ +#define MAX_DMA_PFN ((16*1024*1024) >> PAGE_SHIFT) + +/* 4GB broken PCI/AGP hardware bus master zone */ +#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT) + +/* Compat define for old dma zone */ +#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) /* 8237 DMA controllers */ #define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */ diff -puN include/asm-x86_64/proto.h~x86_64-experimental-4gb-dma-zone include/asm-x86_64/proto.h --- 25/include/asm-x86_64/proto.h~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.611420344 -0800 +++ 25-akpm/include/asm-x86_64/proto.h 2004-11-28 01:56:45.621418824 -0800 @@ -16,6 +16,8 @@ extern void early_idt_handler(void); extern void mcheck_init(struct cpuinfo_x86 *c); extern void init_memory_mapping(unsigned long start, unsigned long end); +extern void size_zones(unsigned long *z, unsigned long start_pfn, + unsigned long end_pfn); extern void system_call(void); extern int kernel_syscall(void); diff -puN include/linux/gfp.h~x86_64-experimental-4gb-dma-zone include/linux/gfp.h --- 25/include/linux/gfp.h~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.612420192 -0800 +++ 25-akpm/include/linux/gfp.h 2004-11-28 01:56:45.622418672 -0800 @@ -11,9 +11,10 @@ struct vm_area_struct; /* * GFP bitmasks.. 
*/ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */ #define __GFP_DMA 0x01 #define __GFP_HIGHMEM 0x02 +#define __GFP_DMA32 0x04 /* * Action modifiers - doesn't change the zoning @@ -58,6 +59,8 @@ struct vm_area_struct; #define GFP_DMA __GFP_DMA +/* 4GB DMA on some platforms */ +#define GFP_DMA32 __GFP_DMA32 /* * There is only one page-allocator function, and two main namespaces to diff -puN include/linux/mmzone.h~x86_64-experimental-4gb-dma-zone include/linux/mmzone.h --- 25/include/linux/mmzone.h~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.614419888 -0800 +++ 25-akpm/include/linux/mmzone.h 2004-11-28 01:56:45.623418520 -0800 @@ -64,11 +64,12 @@ struct per_cpu_pageset { } ____cacheline_aligned_in_smp; #define ZONE_DMA 0 -#define ZONE_NORMAL 1 -#define ZONE_HIGHMEM 2 +#define ZONE_DMA32 1 +#define ZONE_NORMAL 2 +#define ZONE_HIGHMEM 3 -#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ -#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */ +#define ZONES_SHIFT 3 /* ceil(log2(MAX_NR_ZONES)) */ /* @@ -84,7 +85,7 @@ struct per_cpu_pageset { * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible * combinations of zone modifiers in "zone modifier space". */ -#define GFP_ZONEMASK 0x03 +#define GFP_ZONEMASK 0x07 /* * As an optimisation any zone modifier bits which are only valid when * no other zone modifier bits are set (loners) should be placed in @@ -104,6 +105,7 @@ struct per_cpu_pageset { * into multiple physical zones. 
On a PC we have 3 zones: * * ZONE_DMA < 16 MB ISA DMA capable memory + * ZONE_DMA32 0 MB Empty * ZONE_NORMAL 16-896 MB direct mapped by the kernel * ZONE_HIGHMEM > 896 MB only page cache and user processes */ @@ -409,7 +411,7 @@ extern struct pglist_data contig_page_da #endif /* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */ -#define MAX_ZONES_SHIFT 2 +#define MAX_ZONES_SHIFT 3 #if ZONES_SHIFT > MAX_ZONES_SHIFT #error ZONES_SHIFT > MAX_ZONES_SHIFT diff -puN mm/page_alloc.c~x86_64-experimental-4gb-dma-zone mm/page_alloc.c --- 25/mm/page_alloc.c~x86_64-experimental-4gb-dma-zone 2004-11-28 01:56:45.616419584 -0800 +++ 25-akpm/mm/page_alloc.c 2004-11-28 01:56:45.626418064 -0800 @@ -55,7 +55,7 @@ EXPORT_SYMBOL(nr_swap_pages); struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; EXPORT_SYMBOL(zone_table); -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; @@ -1200,6 +1200,10 @@ static int __init build_zonelists_node(p zone = pgdat->node_zones + ZONE_NORMAL; if (zone->present_pages) zonelist->zones[j++] = zone; + case ZONE_DMA32: + zone = pgdat->node_zones + ZONE_DMA32; + if (zone->present_pages) + zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; if (zone->present_pages) @@ -1310,6 +1314,8 @@ static void __init build_zonelists(pg_da k = ZONE_NORMAL; if (i & __GFP_HIGHMEM) k = ZONE_HIGHMEM; + if (i & __GFP_DMA32) + k = ZONE_DMA32; if (i & __GFP_DMA) k = ZONE_DMA; @@ -1336,6 +1342,8 @@ static void __init build_zonelists(pg_da k = ZONE_NORMAL; if (i & __GFP_HIGHMEM) k = ZONE_HIGHMEM; + if (i & __GFP_DMA32) + k = ZONE_DMA32; if (i & __GFP_DMA) k = ZONE_DMA; @@ -1537,7 +1545,7 @@ static void __init free_area_init_core(s if (zholes_size) realsize -= zholes_size[j]; - if (j == ZONE_DMA || j == ZONE_NORMAL) + if (j == ZONE_DMA || j == 
ZONE_NORMAL || j == ZONE_DMA32) nr_kernel_pages += realsize; nr_all_pages += realsize; @@ -1895,12 +1903,12 @@ static void setup_per_zone_protection(vo /* * For each of the different allocation types: - * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM + * GFP_DMA -> GFP_DMA32 -> GFP_KERNEL -> GFP_HIGHMEM */ for (i = 0; i < GFP_ZONETYPES; i++) { /* * For each of the zones: - * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA + * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA32 ->ZONE_DMA */ for (j = MAX_NR_ZONES-1; j >= 0; j--) { zone = &zones[j]; _