From: Matthew Dobson <colpatch@us.ibm.com>




 Documentation/kernel-parameters.txt |    2 
 Documentation/sysctl/vm.txt         |    9 +++
 include/linux/mmzone.h              |   19 +++++++
 include/linux/sysctl.h              |    1 
 init/main.c                         |    1 
 kernel/sysctl.c                     |   17 ++++--
 mm/page_alloc.c                     |   88 +++++++++++++++++++++++++-----------
 7 files changed, 102 insertions(+), 35 deletions(-)

diff -puN Documentation/kernel-parameters.txt~min_free_kbytes Documentation/kernel-parameters.txt
--- 25/Documentation/kernel-parameters.txt~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/Documentation/kernel-parameters.txt	2003-05-23 19:29:12.000000000 -0700
@@ -539,8 +539,6 @@ running once the system is up.
 			[KNL,ACPI] Mark specific memory as reserved.
 			Region of memory to be used, from ss to ss+nn.
 
-	memfrac=	[KNL]
-
 	meye=		[HW] Set MotionEye Camera parameters
 			See Documentation/video4linux/meye.txt.
 
diff -puN Documentation/sysctl/vm.txt~min_free_kbytes Documentation/sysctl/vm.txt
--- 25/Documentation/sysctl/vm.txt~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/Documentation/sysctl/vm.txt	2003-05-23 19:29:12.000000000 -0700
@@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/
 - dirty_background_ratio
 - dirty_expire_centisecs
 - dirty_writeback_centisecs
+- min_free_kbytes
 
 ==============================================================
 
@@ -74,3 +75,11 @@ The number of pages the kernel reads in 
 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense
 for swap because we only cluster swap data in 32-page groups.
 
+==============================================================
+
+min_free_kbytes:
+
+This is used to force the Linux VM to keep a minimum number
+of kilobytes free.  The VM uses this number to compute a pages_min
+value for each lowmem zone in the system.  Each lowmem zone gets
+a number of reserved free pages in proportion to its size.
diff -puN include/linux/mmzone.h~min_free_kbytes include/linux/mmzone.h
--- 25/include/linux/mmzone.h~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/include/linux/mmzone.h	2003-05-23 19:29:12.000000000 -0700
@@ -249,6 +249,25 @@ static inline struct zone *next_zone(str
 #define for_each_zone(zone) \
 	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+/**
+ * is_highmem - report whether a zone is its node's ZONE_HIGHMEM zone.
+ *              Exists to keep direct references to ZONE_{DMA,NORMAL,HIGHMEM}
+ *              out of general code.
+ * @zone: pointer to the struct zone to test
+ */
+static inline int is_highmem(struct zone *zone)
+{
+	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
+}
+
+/* These two functions are used to set up the per-zone pages_min values */
+struct ctl_table;
+struct file;
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
+					  void *, size_t *);
+extern void setup_per_zone_pages_min(void);
+
+
 #ifdef CONFIG_NUMA
 #define MAX_NR_MEMBLKS	BITS_PER_LONG /* Max number of Memory Blocks */
 #else /* !CONFIG_NUMA */
diff -puN include/linux/sysctl.h~min_free_kbytes include/linux/sysctl.h
--- 25/include/linux/sysctl.h~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/include/linux/sysctl.h	2003-05-23 19:29:12.000000000 -0700
@@ -156,6 +156,7 @@ enum
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 };
 
 
diff -puN init/main.c~min_free_kbytes init/main.c
--- 25/init/main.c~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/init/main.c	2003-05-23 19:29:12.000000000 -0700
@@ -390,6 +390,7 @@ asmlinkage void __init start_kernel(void
 	lock_kernel();
 	printk(linux_banner);
 	setup_arch(&command_line);
+	setup_per_zone_pages_min();
 	setup_per_cpu_areas();
 
 	/*
diff -puN kernel/sysctl.c~min_free_kbytes kernel/sysctl.c
--- 25/kernel/sysctl.c~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/kernel/sysctl.c	2003-05-23 19:29:12.000000000 -0700
@@ -57,6 +57,7 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int min_free_kbytes;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -305,24 +306,26 @@ static ctl_table vm_table[] = {
 	    some nicely documented throttling code in wb_kupdate().
 
 	    There is no maximum legal value for dirty_writeback. */
-	 &one , NULL},
+	 &one, NULL},
 	{VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs",
 	 &dirty_expire_centisecs, sizeof(dirty_expire_centisecs), 0644,
 	 NULL, &proc_dointvec},
-	{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
-	  &nr_pdflush_threads, sizeof nr_pdflush_threads,
+	{VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
+	  &nr_pdflush_threads, sizeof(nr_pdflush_threads),
 	  0444 /* read-only*/, NULL, &proc_dointvec},
 	{VM_SWAPPINESS, "swappiness", &vm_swappiness, sizeof(vm_swappiness),
 	 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero,
-	 &one_hundred },
+	 &one_hundred},
 #ifdef CONFIG_HUGETLB_PAGE
-	 {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644,
-	  NULL, &hugetlb_sysctl_handler},
+	{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644,
+	 NULL, &hugetlb_sysctl_handler},
 #endif
 	{VM_LOWER_ZONE_PROTECTION, "lower_zone_protection",
 	 &sysctl_lower_zone_protection, sizeof(sysctl_lower_zone_protection),
 	 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero,
-	 NULL, },
+	 NULL},
+	{VM_MIN_FREE_KBYTES, "min_free_kbytes", &min_free_kbytes, 
+	 sizeof(min_free_kbytes), 0644, NULL, &min_free_kbytes_sysctl_handler},
 	{0}
 };
 
diff -puN mm/page_alloc.c~min_free_kbytes mm/page_alloc.c
--- 25/mm/page_alloc.c~min_free_kbytes	2003-05-23 19:29:12.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-05-23 19:31:54.000000000 -0700
@@ -28,6 +28,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/notifier.h>
+#include <linux/sysctl.h>
 
 #include <asm/topology.h>
 #include <asm/cacheflush.h>
@@ -50,9 +51,7 @@ struct zone *zone_table[MAX_NR_ZONES*MAX
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
-static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
-static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+int min_free_kbytes = 1024;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static int __map_pages(struct page *page, unsigned int num, pgprot_t prot)
@@ -1254,7 +1253,6 @@ static void __init free_area_init_core(s
 	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long mask;
 		unsigned long size, realsize;
 		unsigned long batch;
 
@@ -1328,15 +1326,6 @@ static void __init free_area_init_core(s
 
 		pgdat->nr_zones = j+1;
 
-		mask = (realsize / zone_balance_ratio[j]);
-		if (mask < zone_balance_min[j])
-			mask = zone_balance_min[j];
-		else if (mask > zone_balance_max[j])
-			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
-
 		zone->zone_mem_map = lmem_map;
 		zone->zone_start_pfn = zone_start_pfn;
 
@@ -1421,19 +1410,6 @@ void __init free_area_init(unsigned long
 }
 #endif
 
-static int __init setup_mem_frac(char *str)
-{
-	int j = 0;
-
-	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
-	printk("setup_mem_frac: ");
-	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
-	printk("\n");
-	return 1;
-}
-
-__setup("memfrac=", setup_mem_frac);
-
 #ifdef CONFIG_PROC_FS
 
 #include <linux/seq_file.h>
@@ -1610,3 +1586,63 @@ void __init page_alloc_init(void)
 	init_page_alloc_cpu(smp_processor_id());
 	register_cpu_notifier(&page_alloc_nb);
 }
+
+/*
+ * setup_per_zone_pages_min - called at boot and when min_free_kbytes changes.
+ *	Ensures that the pages_{min,low,high} values for each zone are set
+ *	correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_pages_min(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_pages = 0;
+	unsigned long flags;
+	struct zone *zone;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone)
+		if (!is_highmem(zone))
+			lowmem_pages += zone->present_pages;
+
+	for_each_zone(zone) {
+		/* irqsave: start_kernel() calls us; don't force irqs on */
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (is_highmem(zone)) {
+			/*
+			 * Often, highmem doesn't need to reserve any pages.
+			 * But the pages_min/low/high values are also used for
+			 * batching up page reclaim activity so we need a
+			 * decent value here.
+			 */
+			int min_pages;
+
+			min_pages = zone->present_pages / 1024;
+			if (min_pages < SWAP_CLUSTER_MAX)
+				min_pages = SWAP_CLUSTER_MAX;
+			if (min_pages > 128)
+				min_pages = 128;
+			zone->pages_min = min_pages;
+		} else {
+			/* lowmem: reserve in proportion to the zone's size */
+			zone->pages_min = (pages_min * zone->present_pages) /
+			                   lowmem_pages;
+		}
+
+		zone->pages_low = zone->pages_min * 2;
+		zone->pages_high = zone->pages_min * 3;
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
+
+/*
+ * min_free_kbytes_sysctl_handler - proc_dointvec() wrapper that recomputes
+ * the per-zone watermarks after a successful write; returns its status.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+		struct file *file, void *buffer, size_t *length)
+{
+	int ret = proc_dointvec(table, write, file, buffer, length);
+	if (write && !ret)
+		setup_per_zone_pages_min();
+	return ret;
+}

_