From: Matthew Dobson Documentation/kernel-parameters.txt | 2 Documentation/sysctl/vm.txt | 9 +++ include/linux/mmzone.h | 19 +++++++ include/linux/sysctl.h | 1 init/main.c | 1 kernel/sysctl.c | 11 ++++ mm/page_alloc.c | 89 +++++++++++++++++++++++++----------- 7 files changed, 104 insertions(+), 28 deletions(-) diff -puN Documentation/kernel-parameters.txt~min_free_kbytes Documentation/kernel-parameters.txt --- 25/Documentation/kernel-parameters.txt~min_free_kbytes 2003-06-03 20:32:25.000000000 -0700 +++ 25-akpm/Documentation/kernel-parameters.txt 2003-06-03 20:32:25.000000000 -0700 @@ -540,8 +540,6 @@ running once the system is up. [KNL,ACPI] Mark specific memory as reserved. Region of memory to be used, from ss to ss+nn. - memfrac= [KNL] - meye= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. diff -puN Documentation/sysctl/vm.txt~min_free_kbytes Documentation/sysctl/vm.txt --- 25/Documentation/sysctl/vm.txt~min_free_kbytes 2003-06-03 20:32:25.000000000 -0700 +++ 25-akpm/Documentation/sysctl/vm.txt 2003-06-03 20:32:25.000000000 -0700 @@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/ - dirty_background_ratio - dirty_expire_centisecs - dirty_writeback_centisecs +- min_free_kbytes ============================================================== @@ -74,3 +75,11 @@ The number of pages the kernel reads in 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense for swap because we only cluster swap data in 32-page groups. +============================================================== + +min_free_kbytes: + +This is used to force the Linux VM to keep a minimum number +of kilobytes free. The VM uses this number to compute a pages_min +value for each lowmem zone in the system. Each lowmem zone gets +a number of reserved free pages based proportionally on its size. 
diff -puN include/linux/mmzone.h~min_free_kbytes include/linux/mmzone.h
--- 25/include/linux/mmzone.h~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/include/linux/mmzone.h	2003-06-03 20:32:25.000000000 -0700
@@ -249,6 +249,25 @@ static inline struct zone *next_zone(str
 #define for_each_zone(zone) \
 	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+/**
+ * is_highmem - test whether a zone is its node's ZONE_HIGHMEM zone
+ * @zone: zone to test; its index within zone->zone_pgdat->node_zones is
+ *        compared against ZONE_HIGHMEM and the result (1 or 0) is returned.
+ * Lets general code avoid naming ZONE_{DMA/NORMAL/HIGHMEM/etc} directly.
+ */
+static inline int is_highmem(struct zone *zone)
+{
+	return (zone - zone->zone_pgdat->node_zones == ZONE_HIGHMEM);
+}
+
+/* These two functions are used to set up the per-zone pages_min values */
+struct ctl_table;
+struct file;
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
+					void *, size_t *);
+extern void setup_per_zone_pages_min(void);
+
+
 #ifdef CONFIG_NUMA
 #define MAX_NR_MEMBLKS	BITS_PER_LONG /* Max number of Memory Blocks */
 #else /* !CONFIG_NUMA */
diff -puN include/linux/sysctl.h~min_free_kbytes include/linux/sysctl.h
--- 25/include/linux/sysctl.h~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/include/linux/sysctl.h	2003-06-03 20:32:25.000000000 -0700
@@ -156,6 +156,7 @@ enum
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 };
 
 
diff -puN init/main.c~min_free_kbytes init/main.c
--- 25/init/main.c~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/init/main.c	2003-06-03 20:32:25.000000000 -0700
@@ -390,6 +390,7 @@ asmlinkage void __init start_kernel(void
 	lock_kernel();
 	printk(linux_banner);
 	setup_arch(&command_line);
+	setup_per_zone_pages_min();
 	setup_per_cpu_areas();
 
 	/*
diff -puN kernel/sysctl.c~min_free_kbytes kernel/sysctl.c --- 25/kernel/sysctl.c~min_free_kbytes 2003-06-03 20:32:25.000000000 -0700 +++ 25-akpm/kernel/sysctl.c 2003-06-03 20:34:13.000000000 -0700 @@ -57,6 +57,7 @@ extern char core_pattern[]; extern int cad_pid; extern int pid_max; extern int sysctl_lower_zone_protection; +extern int min_free_kbytes; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -661,6 +662,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_MIN_FREE_KBYTES, + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = &min_free_kbytes_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, { .ctl_name = 0 } }; diff -puN mm/page_alloc.c~min_free_kbytes mm/page_alloc.c --- 25/mm/page_alloc.c~min_free_kbytes 2003-06-03 20:32:25.000000000 -0700 +++ 25-akpm/mm/page_alloc.c 2003-06-03 20:32:25.000000000 -0700 @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -49,9 +50,7 @@ struct zone *zone_table[MAX_NR_ZONES*MAX EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; -static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; -static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +int min_free_kbytes = 1024; /* * Temporary debugging check for pages not lying within a given zone. 
@@ -1212,7 +1211,6 @@ static void __init free_area_init_core(s
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long mask;
 		unsigned long size, realsize;
 		unsigned long batch;
 
@@ -1286,15 +1284,6 @@ static void __init free_area_init_core(s
 
 		pgdat->nr_zones = j+1;
 
-		mask = (realsize / zone_balance_ratio[j]);
-		if (mask < zone_balance_min[j])
-			mask = zone_balance_min[j];
-		else if (mask > zone_balance_max[j])
-			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
-
 		zone->zone_mem_map = lmem_map;
 		zone->zone_start_pfn = zone_start_pfn;
 
@@ -1379,19 +1368,6 @@ void __init free_area_init(unsigned long
 }
 #endif
 
-static int __init setup_mem_frac(char *str)
-{
-	int j = 0;
-
-	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
-	printk("setup_mem_frac: ");
-	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
-	printk("\n");
-	return 1;
-}
-
-__setup("memfrac=", setup_mem_frac);
-
 #ifdef CONFIG_PROC_FS
 
 #include
@@ -1568,3 +1544,85 @@ void __init page_alloc_init(void)
 	init_page_alloc_cpu(smp_processor_id());
 	register_cpu_notifier(&page_alloc_nb);
 }
+
+/*
+ * setup_per_zone_pages_min - recompute every zone's pages_{min,low,high}
+ *	values from min_free_kbytes.  Called at boot and again after
+ *	min_free_kbytes has been changed through sysctl.
+ *
+ *	min_free_kbytes is converted to pages and shared out among the
+ *	lowmem zones in proportion to their present_pages.  Highmem zones
+ *	get present_pages / 1024, clamped to [SWAP_CLUSTER_MAX, 128].
+ *	pages_low and pages_high are always 2x and 3x pages_min.
+ */
+void setup_per_zone_pages_min(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone)
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+
+	for_each_zone(zone) {
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (is_highmem(zone)) {
+			/*
+			 * Often, highmem doesn't need to reserve any pages.
+			 * But the pages_min/low/high values are also used for
+			 * batching up page reclaim activity so we need a
+			 * decent value here.
+			 */
+			int min_pages;
+
+			min_pages = zone->present_pages / 1024;
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->pages_min = min_pages;
+		} else if (lowmem_pages) {
+			/* if it's a lowmem zone, reserve a number of pages
+			 * proportionate to the zone's size.
+			 */
+			zone->pages_min = (pages_min * zone->present_pages) /
+			                   lowmem_pages;
+		} else {
+			/*
+			 * No lowmem zone has any present pages, so there is
+			 * nothing to share out and dividing by lowmem_pages
+			 * would be a division by zero.
+			 */
+			zone->pages_min = 0;
+		}
+
+		zone->pages_low = zone->pages_min * 2;
+		zone->pages_high = zone->pages_min * 3;
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
+
+/*
+ * min_free_kbytes_sysctl_handler - proc handler for min_free_kbytes.
+ *	Lets proc_dointvec_minmax() perform the read or write, so that the
+ *	bounds given in the ctl_table entry (extra1/extra2) are honoured, and
+ *	calls setup_per_zone_pages_min() after a successful write.
+ *	Returns 0 on success, else the error from proc_dointvec_minmax().
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+		struct file *file, void *buffer, size_t *length)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length);
+	if (rc)
+		return rc;
+
+	/* A read cannot change min_free_kbytes; only writes recompute */
+	if (write)
+		setup_per_zone_pages_min();
+	return 0;
+}
_