author     David Howells <dhowells@redhat.com>   2004-06-20 04:52:59 -0700
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2004-06-20 04:52:59 -0700
commit     17e14befcea27d734fd4f2247f2186683ee35ae0 (patch)
tree       29a5b0462c662ec8b502ac4549e4912d9201bb48 /mm
parent     f9e60d7bb448dabc574780d406cfbe82e4bd335e (diff)
download   history-17e14befcea27d734fd4f2247f2186683ee35ae0.tar.gz
[PATCH] Permit inode & dentry hash tables to be allocated > MAX_ORDER size
Here's a patch to allocate memory for big system hash tables with the bootmem allocator rather than with the main page allocator. It is needed for three reasons:

(1) So that the size can be bigger than MAX_ORDER. IBM have done some testing on their big PPC64 systems (64GB of RAM) with linux-2.4 and found that they get better performance if the sizes of the inode cache hash, dentry cache hash, buffer head hash and page cache hash are increased beyond MAX_ORDER (order 11). The main allocator can't allocate anything larger than MAX_ORDER, but the bootmem allocator can. In 2.6 it appears that only the inode and dentry hashes remain of those four, but there are other hash tables that could use this service.

(2) Changing MAX_ORDER appears to have a number of effects beyond just limiting the maximum size that can be allocated in one go.

(3) Should someone want a hash table in which each bucket isn't a power of two in size, memory will be wasted, because the chunk of memory allocated will be a power of two in size (to hold a power of two number of buckets). Using the bootmem allocator instead means the allocation will only take up sufficient pages to hold it, rather than the next power of two up. Admittedly, this point doesn't apply to the dentry and inode hashes, but it might apply to another hash table that wants to use this service.

I've coalesced the meat of the inode and dentry allocation routines into a single routine in mm/page_alloc.c that the respective initialisation functions now call before mem_init() is called. This routine gets its approximation of memory size by counting up the ZONE_NORMAL and ZONE_DMA pages (and ZONE_HIGHMEM if requested) in all the nodes passed to the main allocator by paging_init() (or wherever the arch does it). It does not use max_low_pfn, as that doesn't seem to be available on all archs, and it doesn't use num_physpages, since that includes highmem pages not available to the kernel for allocating data structures in, which may not be appropriate when calculating hash table size.

On the off chance that the size of each hash bucket may not be exactly a power of two, the routine will only allocate as many pages as are necessary to ensure that the number of buckets is exactly a power of two, rather than allocating the smallest power-of-two sized chunk of memory that would hold the same array of buckets.

The maximum size of any single hash table is given by MAX_SYS_HASH_TABLE_ORDER, which is now defined in linux/mmzone.h.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
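For illustration, a caller replaces its open-coded hash allocation with a single call to the new routine. The sketch below shows what the dentry-cache initialisation might look like; the identifiers dentry_hashtable, dhash_entries, d_hash_shift and d_hash_mask are assumed to match the existing globals in fs/dcache.c, and a scale of 13 asks for roughly one bucket per 8KB of low memory:

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_head),
                                        dhash_entries,  /* cmdline override; 0 = auto-size */
                                        13,             /* 1 bucket per 2^13 bytes of low memory */
                                        0,              /* don't count highmem when sizing */
                                        &d_hash_shift,
                                        &d_hash_mask);

Lookups then index the table with hash & d_hash_mask, which is why the routine guarantees an exact power-of-two number of buckets.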
Diffstat (limited to 'mm')
-rw-r--r--  mm/page_alloc.c | 73
1 file changed, 73 insertions, 0 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 444bb534dbd8ee..16d5c2af94ee02 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,6 +55,9 @@ EXPORT_SYMBOL(zone_table);
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
+static unsigned long __initdata nr_kernel_pages;
+static unsigned long __initdata nr_all_pages;
+
 /*
  * Temporary debugging check for pages not lying within a given zone.
  */
@@ -1430,6 +1433,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 if (zholes_size)
                         realsize -= zholes_size[j];
 
+                if (j == ZONE_DMA || j == ZONE_NORMAL)
+                        nr_kernel_pages += realsize;
+                nr_all_pages += realsize;
+
                 zone->spanned_pages = size;
                 zone->present_pages = realsize;
                 zone->name = zone_names[j];
@@ -1970,3 +1977,69 @@ int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
         setup_per_zone_protection();
         return 0;
 }
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+                                     unsigned long bucketsize,
+                                     unsigned long numentries,
+                                     int scale,
+                                     int consider_highmem,
+                                     unsigned int *_hash_shift,
+                                     unsigned int *_hash_mask)
+{
+        unsigned long mem, max, log2qty, size;
+        void *table;
+
+        /* round applicable memory size up to nearest megabyte */
+        mem = consider_highmem ? nr_all_pages : nr_kernel_pages;
+        mem += (1UL << (20 - PAGE_SHIFT)) - 1;
+        mem >>= 20 - PAGE_SHIFT;
+        mem <<= 20 - PAGE_SHIFT;
+
+        /* limit to 1 bucket per 2^scale bytes of low memory (rounded up to
+         * nearest power of 2 in size) */
+        if (scale > PAGE_SHIFT)
+                mem >>= (scale - PAGE_SHIFT);
+        else
+                mem <<= (PAGE_SHIFT - scale);
+
+        mem = 1UL << (long_log2(mem) + 1);
+
+        /* limit allocation size */
+        max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize;
+        if (max > mem)
+                max = mem;
+
+        /* allow the kernel cmdline to have a say */
+        if (!numentries || numentries > max)
+                numentries = max;
+
+        log2qty = long_log2(numentries);
+
+        do {
+                size = bucketsize << log2qty;
+
+                table = (void *) alloc_bootmem(size);
+
+        } while (!table && size > PAGE_SIZE);
+
+        if (!table)
+                panic("Failed to allocate %s hash table\n", tablename);
+
+        printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+               tablename,
+               (1U << log2qty),
+               long_log2(size) - PAGE_SHIFT,
+               size);
+
+        if (_hash_shift)
+                *_hash_shift = log2qty;
+        if (_hash_mask)
+                *_hash_mask = (1 << log2qty) - 1;
+
+        return table;
+}
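To make the sizing arithmetic concrete, the standalone sketch below replays it in userspace for a machine with 1GB of low memory and 4KB pages, using dentry-like parameters (4-byte buckets, scale 13). Here log2ul() stands in for the kernel's long_log2(), and the MAX_SYS_HASH_TABLE_ORDER value of 14 is an assumption for illustration, not necessarily the value defined in linux/mmzone.h:

#include <stdio.h>

#define PAGE_SHIFT 12                           /* 4KB pages */
#define MAX_SYS_HASH_TABLE_ORDER 14             /* assumed for illustration */

/* floor(log2(n)), standing in for the kernel's long_log2() */
static unsigned long log2ul(unsigned long n)
{
        unsigned long log = 0;

        while (n >>= 1)
                log++;
        return log;
}

int main(void)
{
        unsigned long nr_kernel_pages = 262144; /* 1GB of low memory in pages */
        unsigned long bucketsize = 4;           /* hlist_head on a 32-bit kernel */
        int scale = 13;
        unsigned long mem, max, numentries = 0, log2qty, size;

        /* round the page count up to a whole number of megabytes */
        mem = nr_kernel_pages + (1UL << (20 - PAGE_SHIFT)) - 1;
        mem >>= 20 - PAGE_SHIFT;
        mem <<= 20 - PAGE_SHIFT;

        /* one bucket per 2^scale bytes of low memory */
        if (scale > PAGE_SHIFT)
                mem >>= (scale - PAGE_SHIFT);
        else
                mem <<= (PAGE_SHIFT - scale);

        /* round up to a power of two (an exact power of two is doubled) */
        mem = 1UL << (log2ul(mem) + 1);

        /* cap the table at MAX_SYS_HASH_TABLE_ORDER worth of pages */
        max = (1UL << (PAGE_SHIFT + MAX_SYS_HASH_TABLE_ORDER)) / bucketsize;
        if (max > mem)
                max = mem;
        if (!numentries || numentries > max)
                numentries = max;

        log2qty = log2ul(numentries);
        size = bucketsize << log2qty;

        /* prints: 262144 entries (order: 8, 1048576 bytes) */
        printf("%lu entries (order: %lu, %lu bytes)\n",
               numentries, log2ul(size) - PAGE_SHIFT, size);
        return 0;
}

On this configuration the table gets 2^18 buckets in a 1MB (order-8) allocation, matching the format of the printk in the routine above. Note that it is MAX_SYS_HASH_TABLE_ORDER, not MAX_ORDER, that bounds the final size, which is the point of the patch.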