From: j-nomura@ce.jp.nec.com The attached patch is a NUMA-aware zonelist builder patch, which sorts the zonelist in the order that near nodes come first and far nodes last. In lse-tech and linux-ia64, where most of the NUMA people reside, no objections have been raised so far. The patch adds a NUMA-specific version of build_zonelists which calls find_next_best_node to select the next-nearest node to add to the zonelist. The patch has no effect on flat NUMA platforms. --- 25-akpm/include/asm-generic/topology.h | 7 ++ 25-akpm/include/asm-i386/topology.h | 6 + 25-akpm/mm/page_alloc.c | 105 +++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+) diff -puN include/asm-generic/topology.h~numa-aware-zonelist-builder include/asm-generic/topology.h --- 25/include/asm-generic/topology.h~numa-aware-zonelist-builder Mon Mar 1 15:18:30 2004 +++ 25-akpm/include/asm-generic/topology.h Mon Mar 1 15:18:30 2004 @@ -45,6 +45,13 @@ #define pcibus_to_cpumask(bus) (cpu_online_map) #endif +#ifndef node_distance +#define node_distance(from,to) (from != to) +#endif +#ifndef PENALTY_FOR_NODE_WITH_CPUS +#define PENALTY_FOR_NODE_WITH_CPUS (1) +#endif + /* Cross-node load balancing interval. */ #ifndef NODE_BALANCE_RATE #define NODE_BALANCE_RATE 10 diff -puN include/asm-i386/topology.h~numa-aware-zonelist-builder include/asm-i386/topology.h --- 25/include/asm-i386/topology.h~numa-aware-zonelist-builder Mon Mar 1 15:18:30 2004 +++ 25-akpm/include/asm-i386/topology.h Mon Mar 1 15:18:30 2004 @@ -66,6 +66,12 @@ static inline cpumask_t pcibus_to_cpumas return node_to_cpumask(mp_bus_id_to_node[bus]); } +/* Node-to-Node distance */ +static inline int node_distance(int from, int to) +{ + return (from != to); +} + /* Cross-node load balancing interval. 
*/ #define NODE_BALANCE_RATE 100 diff -puN mm/page_alloc.c~numa-aware-zonelist-builder mm/page_alloc.c --- 25/mm/page_alloc.c~numa-aware-zonelist-builder Mon Mar 1 15:18:30 2004 +++ 25-akpm/mm/page_alloc.c Mon Mar 1 15:18:30 2004 @@ -1128,6 +1128,109 @@ static int __init build_zonelists_node(p return j; } +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (numnodes) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given + * node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: pointer to the bitmap of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. 
+ */ +static int __init find_next_best_node(int node, void *used_node_mask) +{ + int i, n, val; + int min_val = INT_MAX; + int best_node = -1; + + for (i = 0; i < numnodes; i++) { + /* Start from local node */ + n = (node+i)%numnodes; + + /* Don't want a node to appear more than once */ + if (test_bit(n, used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Give preference to headless and unused nodes */ + if (node_to_cpumask(n)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + set_bit(best_node, used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + DECLARE_BITMAP(used_mask, MAX_NUMNODES); + + /* initialize zonelists */ + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = numnodes; + prev_node = local_node; + CLEAR_BITMAP(used_mask, MAX_NUMNODES); + while ((node = find_next_best_node(local_node, used_mask)) >= 0) { + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. 
+ */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < MAX_NR_ZONES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + static void __init build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -1164,6 +1267,8 @@ static void __init build_zonelists(pg_da } } +#endif /* CONFIG_NUMA */ + void __init build_all_zonelists(void) { int i; _