aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cpusets.txt25
-rw-r--r--include/linux/mempolicy.h7
-rw-r--r--kernel/cpuset.c38
3 files changed, 68 insertions, 2 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index a09a8eb80665e..e2d9afc30d2d6 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -192,6 +192,7 @@ containing the following files describing that cpuset:
- cpus: list of CPUs in that cpuset
- mems: list of Memory Nodes in that cpuset
+ - memory_migrate flag: if set, move pages to cpusets nodes
- cpu_exclusive flag: is cpu placement exclusive?
- mem_exclusive flag: is memory placement exclusive?
- tasks: list of tasks (by pid) attached to that cpuset
@@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset. This is done to avoid
impacting the scheduler code in the kernel with a check for changes
in a tasks processor placement.
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'mems' subsequently changes.
+If the cpuset flag file 'memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the tasks new cpuset. Depending on the implementation,
+this migration may either be done by swapping the page out,
+so that the next time the page is referenced, it will be paged
+into the tasks new cpuset, usually on the node where it was
+referenced, or this migration may be done by directly copying
+the pages from the tasks previous cpuset to the new cpuset,
+where possible to the same node, relative to the new cpuset,
+as the node that held the page, relative to the old cpuset.
+Also if 'memory_migrate' is set true, then if that cpusets
+'mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'mems',
+will be moved to nodes in the new setting of 'mems.' Again,
+depending on the implementation, this might be done by swapping,
+or by direct copying. In either case, pages that were not in
+the tasks prior cpuset, or in the cpusets prior 'mems' setting,
+will not be moved.
+
There is an exception to the above. If hotplug functionality is used
to remove all the CPUs that are currently assigned to a cpuset,
then the kernel will automatically update the cpus_allowed of all
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3e61e829681db..66247eff24a05 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
}
+static inline int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes,
+ const nodemask_t *to_nodes, int flags)
+{
+ return 0;
+}
+
static inline void check_highest_zone(int k)
{
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816d..f63383e01ec78 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,6 +87,7 @@ struct cpuset {
typedef enum {
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
+ CS_MEMORY_MIGRATE,
CS_REMOVED,
CS_NOTIFY_ON_RELEASE
} cpuset_flagbits_t;
@@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs)
return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
}
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+ return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
/*
* Increment this atomic integer everytime any cpuset changes its
* mems_allowed value. Users of cpusets can track this generation
@@ -602,16 +608,24 @@ static void refresh_mems(void)
if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
struct cpuset *cs;
nodemask_t oldmem = current->mems_allowed;
+ int migrate;
down(&callback_sem);
task_lock(current);
cs = current->cpuset;
+ migrate = is_memory_migrate(cs);
guarantee_online_mems(cs, &current->mems_allowed);
current->cpuset_mems_generation = cs->mems_generation;
task_unlock(current);
up(&callback_sem);
- if (!nodes_equal(oldmem, current->mems_allowed))
+ if (!nodes_equal(oldmem, current->mems_allowed)) {
numa_policy_rebind(&oldmem, &current->mems_allowed);
+ if (migrate) {
+ do_migrate_pages(current->mm, &oldmem,
+ &current->mems_allowed,
+ MPOL_MF_MOVE_ALL);
+ }
+ }
}
}
@@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- * CS_NOTIFY_ON_RELEASE)
+ * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
* cs: the cpuset to update
* buf: the buffer where we read the 0 or 1
*
@@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
struct task_struct *tsk;
struct cpuset *oldcs;
cpumask_t cpus;
+ nodemask_t from, to;
if (sscanf(pidbuf, "%d", &pid) != 1)
return -EIO;
@@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
guarantee_online_cpus(cs, &cpus);
set_cpus_allowed(tsk, cpus);
+ from = oldcs->mems_allowed;
+ to = cs->mems_allowed;
+
up(&callback_sem);
+ if (is_memory_migrate(cs))
+ do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
put_task_struct(tsk);
if (atomic_dec_and_test(&oldcs->count))
check_for_release(oldcs, ppathbuf);
@@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
typedef enum {
FILE_ROOT,
FILE_DIR,
+ FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
FILE_CPU_EXCLUSIVE,
@@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
case FILE_NOTIFY_ON_RELEASE:
retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
break;
+ case FILE_MEMORY_MIGRATE:
+ retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+ break;
case FILE_TASKLIST:
retval = attach_task(cs, buffer, &pathbuf);
break;
@@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
case FILE_NOTIFY_ON_RELEASE:
*s++ = notify_on_release(cs) ? '1' : '0';
break;
+ case FILE_MEMORY_MIGRATE:
+ *s++ = is_memory_migrate(cs) ? '1' : '0';
+ break;
default:
retval = -EINVAL;
goto out;
@@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = {
.private = FILE_NOTIFY_ON_RELEASE,
};
+static struct cftype cft_memory_migrate = {
+ .name = "memory_migrate",
+ .private = FILE_MEMORY_MIGRATE,
+};
+
static int cpuset_populate_dir(struct dentry *cs_dentry)
{
int err;
@@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
+ return err;
if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
return err;
return 0;