From: Matthew Dobson Documentation/kernel-parameters.txt | 2 Documentation/sysctl/vm.txt | 9 +++ include/linux/mmzone.h | 19 +++++++ include/linux/sysctl.h | 1 init/main.c | 1 kernel/sysctl.c | 17 ++++-- mm/page_alloc.c | 89 +++++++++++++++++++++++++----------- 7 files changed, 103 insertions(+), 35 deletions(-) diff -puN Documentation/kernel-parameters.txt~min_free_kbytes Documentation/kernel-parameters.txt --- 25/Documentation/kernel-parameters.txt~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/Documentation/kernel-parameters.txt 2003-05-29 00:03:15.000000000 -0700 @@ -540,8 +540,6 @@ running once the system is up. [KNL,ACPI] Mark specific memory as reserved. Region of memory to be used, from ss to ss+nn. - memfrac= [KNL] - meye= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. diff -puN Documentation/sysctl/vm.txt~min_free_kbytes Documentation/sysctl/vm.txt --- 25/Documentation/sysctl/vm.txt~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/Documentation/sysctl/vm.txt 2003-05-29 00:03:15.000000000 -0700 @@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/ - dirty_background_ratio - dirty_expire_centisecs - dirty_writeback_centisecs +- min_free_kbytes ============================================================== @@ -74,3 +75,11 @@ The number of pages the kernel reads in 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense for swap because we only cluster swap data in 32-page groups. +============================================================== + +min_free_kbytes: + +This is used to force the Linux VM to keep a minimum number +of kilobytes free. The VM uses this number to compute a pages_min +value for each lowmem zone in the system. Each lowmem zone gets +a number of reserved free pages based proportionally on its size. 
diff -puN include/linux/mmzone.h~min_free_kbytes include/linux/mmzone.h --- 25/include/linux/mmzone.h~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/include/linux/mmzone.h 2003-05-29 00:03:15.000000000 -0700 @@ -249,6 +249,25 @@ static inline struct zone *next_zone(str #define for_each_zone(zone) \ for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) +/** + * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. + * @zone - pointer to struct zone variable + */ +static inline int is_highmem(struct zone *zone) +{ + return (zone - zone->zone_pgdat->node_zones == ZONE_HIGHMEM); +} + +/* These two functions are used to setup the per zone pages min values */ +struct ctl_table; +struct file; +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, + void *, size_t *); +extern void setup_per_zone_pages_min(void); + + #ifdef CONFIG_NUMA #define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ #else /* !CONFIG_NUMA */ diff -puN include/linux/sysctl.h~min_free_kbytes include/linux/sysctl.h --- 25/include/linux/sysctl.h~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/include/linux/sysctl.h 2003-05-29 00:03:15.000000000 -0700 @@ -156,6 +156,7 @@ enum VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ }; diff -puN init/main.c~min_free_kbytes init/main.c --- 25/init/main.c~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/init/main.c 2003-05-29 00:03:15.000000000 -0700 @@ -390,6 +390,7 @@ asmlinkage void __init start_kernel(void lock_kernel(); printk(linux_banner); setup_arch(&command_line); + setup_per_zone_pages_min(); setup_per_cpu_areas(); /* 
diff -puN kernel/sysctl.c~min_free_kbytes kernel/sysctl.c --- 25/kernel/sysctl.c~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/kernel/sysctl.c 2003-05-29 00:03:15.000000000 -0700 @@ -57,6 +57,7 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_free_kbytes; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -305,24 +306,26 @@ static ctl_table vm_table[] = { some nicely documented throttling code in wb_kupdate(). There is no maximum legal value for dirty_writeback. */ - &one , NULL}, + &one, NULL}, {VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs", &dirty_expire_centisecs, sizeof(dirty_expire_centisecs), 0644, NULL, &proc_dointvec}, - { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads", - &nr_pdflush_threads, sizeof nr_pdflush_threads, + {VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads", + &nr_pdflush_threads, sizeof(nr_pdflush_threads), 0444 /* read-only*/, NULL, &proc_dointvec}, {VM_SWAPPINESS, "swappiness", &vm_swappiness, sizeof(vm_swappiness), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero, - &one_hundred }, + &one_hundred}, #ifdef CONFIG_HUGETLB_PAGE - {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, - NULL, &hugetlb_sysctl_handler}, + {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, + NULL, &hugetlb_sysctl_handler}, #endif {VM_LOWER_ZONE_PROTECTION, "lower_zone_protection", &sysctl_lower_zone_protection, sizeof(sysctl_lower_zone_protection), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero, - NULL, }, + NULL}, + {VM_MIN_FREE_KBYTES, "min_free_kbytes", &min_free_kbytes, + sizeof(min_free_kbytes), 0644, NULL, &min_free_kbytes_sysctl_handler}, {0} }; diff -puN mm/page_alloc.c~min_free_kbytes mm/page_alloc.c --- 25/mm/page_alloc.c~min_free_kbytes 2003-05-29 00:03:15.000000000 -0700 +++ 25-akpm/mm/page_alloc.c 2003-05-29 00:03:15.000000000 -0700 @@ -28,6 
+28,7 @@ #include #include #include +#include #include @@ -48,9 +49,7 @@ struct zone *zone_table[MAX_NR_ZONES*MAX EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; -static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; -static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +int min_free_kbytes = 1024; #ifdef CONFIG_DEBUG_PAGEALLOC static int __map_pages(struct page *page, unsigned int num, pgprot_t prot) @@ -1253,7 +1252,6 @@ static void __init free_area_init_core(s for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long mask; unsigned long size, realsize; unsigned long batch; @@ -1327,15 +1325,6 @@ static void __init free_area_init_core(s pgdat->nr_zones = j+1; - mask = (realsize / zone_balance_ratio[j]); - if (mask < zone_balance_min[j]) - mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->pages_min = mask; - zone->pages_low = mask*2; - zone->pages_high = mask*3; - zone->zone_mem_map = lmem_map; zone->zone_start_pfn = zone_start_pfn; @@ -1420,19 +1409,6 @@ void __init free_area_init(unsigned long } #endif -static int __init setup_mem_frac(char *str) -{ - int j = 0; - - while (get_option(&str, &zone_balance_ratio[j++]) == 2); - printk("setup_mem_frac: "); - for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); - printk("\n"); - return 1; -} - -__setup("memfrac=", setup_mem_frac); - #ifdef CONFIG_PROC_FS #include @@ -1609,3 +1585,64 @@ void __init page_alloc_init(void) init_page_alloc_cpu(smp_processor_id()); register_cpu_notifier(&page_alloc_nb); } + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. 
+ */
+void setup_per_zone_pages_min(void)
+{
+	unsigned long pages_min = 0;
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	if (min_free_kbytes > 0)	/* a negative value would wrap around */
+		pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone)
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+	if (!lowmem_pages)
+		lowmem_pages = 1;	/* keep the division below defined */
+
+	for_each_zone(zone) {
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (is_highmem(zone)) {
+			/*
+			 * Highmem rarely needs a reserve, but pages_min/low/high
+			 * also batch up page reclaim, so keep a decent value.
+			 */
+			int min_pages = zone->present_pages / 1024;
+
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->pages_min = min_pages;
+		} else {
+			/* Lowmem: reserve a share proportional to zone size. */
+			zone->pages_min = (pages_min * zone->present_pages) /
+						lowmem_pages;
+		}
+		zone->pages_low = zone->pages_min * 2;
+		zone->pages_high = zone->pages_min * 3;
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
+
+/*
+ * min_free_kbytes_sysctl_handler - wrapper around proc_dointvec() that
+ * recomputes the per-zone values after a successful write to
+ * min_free_kbytes.  Returns whatever proc_dointvec() returned.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+		struct file *file, void *buffer, size_t *length)
+{
+	int ret = proc_dointvec(table, write, file, buffer, length);
+
+	if (ret == 0 && write)	/* only a successful write needs a recompute */
+		setup_per_zone_pages_min();
+	return ret;
+}
_