From: Paul Jackson This patch gets cpusets working with the cpus_online_map and nodes_online_map that are manipulated by the hotplug features. Cpusets keeps bitmaps of CPUs and Memory Nodes where tasks must run. Hotplug can add and remove CPUs and Memory, which could take all the CPUs or Memory in a cpuset offline. This patch handles this case by using other CPUs and Memory (from parent cpusets) in order to avoid starving a task of the CPU and Memory it needs to run. When using hotplug, administrators should reconfigure cpusets to reflect the actual resources available. This patch provides a default allocation of resources if they don't. It is better to violate a misconfigured cpusets placement than to starve a process. This patch also adds the ability to change the top, root cpuset, adding and removing CPUs and Memory Nodes from it, just as was already supported for subordinate cpusets, and following the same rules. This is necessary to allow the administrator to adapt a cpuset configuration when adding and removing CPUs and Memory Nodes using hotplug. Several arch's built. Arch ia64 sn2_defconfig booted and unit tested. No actual testing using hotplug to add or remove resources was done. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton --- 25-akpm/Documentation/cpusets.txt | 12 ++++ 25-akpm/kernel/cpuset.c | 100 +++++++++++++++++++++++++++++--------- 25-akpm/kernel/sched.c | 4 - 3 files changed, 91 insertions(+), 25 deletions(-) diff -puN Documentation/cpusets.txt~cpusets-interoperate-with-hotplug-online-maps Documentation/cpusets.txt --- 25/Documentation/cpusets.txt~cpusets-interoperate-with-hotplug-online-maps 2005-01-10 21:15:17.374812208 -0800 +++ 25-akpm/Documentation/cpusets.txt 2005-01-10 21:15:17.381811144 -0800 @@ -249,6 +249,18 @@ rewritten to the 'tasks' file of its cpu impacting the scheduler code in the kernel with a check for changes in a tasks processor placement. +There is an exception to the above. If hotplug funtionality is used +to remove all the CPUs that are currently assigned to a cpuset, +then the kernel will automatically update the cpus_allowed of all +tasks attached to CPUs in that cpuset with the online CPUs of the +nearest parent cpuset that still has some CPUs online. When memory +hotplug functionality for removing Memory Nodes is available, a +similar exception is expected to apply there as well. In general, +the kernel prefers to violate cpuset placement, over starving a task +that has had all its allowed CPUs or Memory Nodes taken offline. User +code should reconfigure cpusets to only refer to online CPUs and Memory +Nodes when using hotplug to add or remove such resources. + To start a new job that is to be contained within a cpuset, the steps are: 1) mkdir /dev/cpuset diff -puN kernel/cpuset.c~cpusets-interoperate-with-hotplug-online-maps kernel/cpuset.c --- 25/kernel/cpuset.c~cpusets-interoperate-with-hotplug-online-maps 2005-01-10 21:15:17.375812056 -0800 +++ 25-akpm/kernel/cpuset.c 2005-01-10 21:15:17.383810840 -0800 @@ -455,6 +455,55 @@ out: } /* + * Return in *pmask the portion of a cpusets's cpus_allowed that + * are online. If none are online, walk up the cpuset hierarchy + * until we find one that does have some online cpus. If we get + * all the way to the top and still haven't found any online cpus, + * return cpu_online_map. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_map. + * + * One way or another, we guarantee to return some non-empty subset + * of cpu_online_map. + * + * Call with cpuset_sem held. + */ + +static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) +{ + while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) + cs = cs->parent; + if (cs) + cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); + else + *pmask = cpu_online_map; + BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); +} + +/* + * Return in *pmask the portion of a cpusets's mems_allowed that + * are online. If none are online, walk up the cpuset hierarchy + * until we find one that does have some online mems. If we get + * all the way to the top and still haven't found any online mems, + * return node_online_map. + * + * One way or another, we guarantee to return some non-empty subset + * of node_online_map. + * + * Call with cpuset_sem held. + */ + +static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +{ + while (cs && !nodes_intersects(cs->mems_allowed, node_online_map)) + cs = cs->parent; + if (cs) + nodes_and(*pmask, cs->mems_allowed, node_online_map); + else + *pmask = node_online_map; + BUG_ON(!nodes_intersects(*pmask, node_online_map)); +} + +/* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? * * One cpuset is a subset of another if all its allowed CPUs and @@ -492,18 +541,7 @@ static int is_cpuset_subset(const struct static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { - struct cpuset *c, *par = cur->parent; - - /* - * Don't mess with Big Daddy - top_cpuset must remain maximal. - * And besides, the rest of this routine blows chunks if par == 0. - */ - if (cur == &top_cpuset) - return -EPERM; - - /* We must be a subset of our parent cpuset */ - if (!is_cpuset_subset(trial, par)) - return -EACCES; + struct cpuset *c, *par; /* Each of our child cpusets must be a subset of us */ list_for_each_entry(c, &cur->children, sibling) { @@ -511,6 +549,14 @@ static int validate_change(const struct return -EBUSY; } + /* Remaining checks don't apply to root cpuset */ + if ((par = cur->parent) == NULL) + return 0; + + /* We must be a subset of our parent cpuset */ + if (!is_cpuset_subset(trial, par)) + return -EACCES; + /* If either I or some sibling (!= me) is exclusive, we can't overlap */ list_for_each_entry(c, &par->children, sibling) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && @@ -536,6 +582,8 @@ static int update_cpumask(struct cpuset if (retval < 0) return retval; cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); + if (cpus_empty(trialcs.cpus_allowed)) + return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) cs->cpus_allowed = trialcs.cpus_allowed; @@ -551,6 +599,9 @@ static int update_nodemask(struct cpuset retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) return retval; + nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); + if (nodes_empty(trialcs.mems_allowed)) + return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval == 0) { cs->mems_allowed = trialcs.mems_allowed; @@ -597,6 +648,7 @@ static int attach_task(struct cpuset *cs pid_t pid; struct task_struct *tsk; struct cpuset *oldcs; + cpumask_t cpus; if (sscanf(buf, "%d", &pid) != 1) return -EIO; @@ -636,7 +688,8 @@ static int attach_task(struct cpuset *cs tsk->cpuset = cs; task_unlock(tsk); - set_cpus_allowed(tsk, cs->cpus_allowed); + guarantee_online_cpus(cs, &cpus); + set_cpus_allowed(tsk, cpus); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) @@ -1276,7 +1329,9 @@ int __init cpuset_init(void) struct dentry *root; int err; - top_cpuset.mems_allowed = node_possible_map; + top_cpuset.cpus_allowed = CPU_MASK_ALL; + top_cpuset.mems_allowed = NODE_MASK_ALL; + atomic_inc(&cpuset_mems_generation); top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); @@ -1305,12 +1360,13 @@ out: /** * cpuset_init_smp - initialize cpus_allowed * - * Description: Initialize cpus_allowed after cpu_possible_map is initialized + * Description: Finish top cpuset after cpu, node maps are initialized **/ void __init cpuset_init_smp(void) { - top_cpuset.cpus_allowed = cpu_possible_map; + top_cpuset.cpus_allowed = cpu_online_map; + top_cpuset.mems_allowed = node_online_map; } /** @@ -1358,7 +1414,9 @@ void cpuset_exit(struct task_struct *tsk * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * * Description: Returns the cpumask_t cpus_allowed of the cpuset - * attached to the specified @tsk. + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of cpu_online_map, even if this means going outside the + * tasks cpuset. **/ const cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) @@ -1367,10 +1425,7 @@ const cpumask_t cpuset_cpus_allowed(cons down(&cpuset_sem); task_lock((struct task_struct *)tsk); - if (tsk->cpuset) - mask = tsk->cpuset->cpus_allowed; - else - mask = CPU_MASK_ALL; + guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); up(&cpuset_sem); @@ -1387,6 +1442,7 @@ void cpuset_init_current_mems_allowed(vo * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). */ + void cpuset_update_current_mems_allowed() { struct cpuset *cs = current->cpuset; @@ -1395,7 +1451,7 @@ void cpuset_update_current_mems_allowed( return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { down(&cpuset_sem); - current->mems_allowed = cs->mems_allowed; + guarantee_online_mems(cs, ¤t->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; up(&cpuset_sem); } diff -puN kernel/sched.c~cpusets-interoperate-with-hotplug-online-maps kernel/sched.c --- 25/kernel/sched.c~cpusets-interoperate-with-hotplug-online-maps 2005-01-10 21:15:17.377811752 -0800 +++ 25-akpm/kernel/sched.c 2005-01-10 21:15:17.386810384 -0800 @@ -4126,9 +4126,7 @@ static void move_task_off_dead_cpu(int d /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { - tsk->cpus_allowed = cpuset_cpus_allowed(tsk); - if (!cpus_intersects(tsk->cpus_allowed, cpu_online_map)) - cpus_setall(tsk->cpus_allowed); + tsk->cpus_allowed = cpuset_cpus_allowed(tsk); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* _