From 99f895518368252ba862cc15ce4eb98ebbe1bec6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 26 Jun 2006 00:25:55 -0700 Subject: [PATCH] proc: don't lock task_structs indefinitely Every inode in /proc holds a reference to a struct task_struct. If a directory or file is opened and remains open after the task exits, this pinning continues. With 8K stacks on a 32bit machine the amount pinned per file descriptor is about 10K. Normally I would figure a reasonable per user process limit is about 100 processes. With 80 processes, each with 1000 file descriptors, I can trigger the OOM killer on a 32bit kernel, because I have pinned about 800MB of useless data. This patch replaces the struct task_struct pointer with a pointer to a struct task_ref which has a struct task_struct pointer, so the pinning of dead tasks does not happen. The code now has to contend with the fact that the task may now exit at any time. This is a little, but not much, more complicated. With this change it takes about 1000 processes each opening up 1000 file descriptors before I can trigger the OOM killer. Much better. [mlp@google.com: task_mmu small fixes] Signed-off-by: Eric W. 
Biederman Cc: Trond Myklebust Cc: Paul Jackson Cc: Oleg Nesterov Cc: Albert Cahalan Signed-off-by: Prasanna Meda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 355 ++++++++++++++++++++++++++++++++++-------------- fs/proc/inode.c | 9 +- fs/proc/internal.h | 15 +- fs/proc/task_mmu.c | 72 +++++++--- include/linux/proc_fs.h | 8 +- kernel/cpuset.c | 27 +++- mm/mempolicy.c | 6 +- 7 files changed, 349 insertions(+), 143 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 20746e124409fe..489810abc72d9b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -307,12 +307,15 @@ static struct pid_entry tid_attr_stuff[] = { static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); - files = get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -344,10 +347,29 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) return fs; } +static int get_nr_threads(struct task_struct *tsk) +{ + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); + } + return count; +} + static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->pwdmnt); @@ -361,8 +383,14 @@ static int proc_cwd_link(struct inode *inode, 
struct dentry **dentry, struct vfs static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -550,16 +578,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { ret = -ENOMEM; @@ -616,17 +647,21 @@ static struct file_operations proc_mounts_operations = { extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -653,18 +688,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - 
struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -681,12 +725,15 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -736,6 +783,8 @@ out_put: out_free: free_page((unsigned long) page); out: + put_task_struct(task); +out_no_task: return ret; } @@ -748,15 +797,20 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char *)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -779,6 +833,9 @@ static ssize_t mem_write(struct file * file, const char * buf, } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -809,12 +866,17 @@ static struct file_operations proc_mem_operations = { static ssize_t 
oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; @@ -829,7 +891,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task; char buffer[PROC_NUMBUF], *end; int oom_adjust; @@ -845,7 +907,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -862,12 +928,15 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -877,13 +946,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if 
(current != task) + if (current != proc_tref(inode)->task) return -EPERM; if (count >= PAGE_SIZE) @@ -907,7 +975,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -926,13 +994,16 @@ static struct file_operations proc_loginuid_operations = { static ssize_t seccomp_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -946,29 +1017,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf, static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return 
end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -995,7 +1080,7 @@ static int proc_check_dentry_visible(struct inode *inode, /* See if the the two tasks share a commone set of * file descriptors. If so everything is visible. */ - task = proc_task(inode); + task = get_proc_task(inode); if (!task) goto out; files = get_files_struct(current); @@ -1006,6 +1091,7 @@ static int proc_check_dentry_visible(struct inode *inode, put_files_struct(task_files); if (files) put_files_struct(files); + put_task_struct(task); if (!error) goto out; @@ -1106,7 +1192,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; char buf[PROC_NUMBUF]; @@ -1114,8 +1200,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1164,6 +1250,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1175,16 +1263,18 @@ static int proc_pident_readdir(struct file *filp, int pid; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1270,14 +1360,13 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st inode->i_mtime = inode->i_atime = 
inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; + ei->tref = tref_get_by_task(task); + if (!tref_task(ei->tref)) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; if (task_dumpable(task)) { @@ -1303,13 +1392,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1317,37 +1414,63 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int 
tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; @@ -1359,7 +1482,7 @@ static int pid_delete_dentry(struct dentry * dentry) * If so, then don't put the dentry on the lru list, * kill it immediately. 
*/ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_tref(dentry->d_inode)->task; } static struct dentry_operations tid_fd_dentry_operations = @@ -1401,7 +1524,7 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); struct dentry *result = ERR_PTR(-ENOENT); struct file * file; @@ -1409,10 +1532,10 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) @@ -1447,6 +1570,8 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (tid_fd_revalidate(dentry, NULL)) result = NULL; out: + put_task_struct(task); +out_no_task: return result; out_unlock2: @@ -1490,12 +1615,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1503,6 +1633,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1512,26 +1645,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; 
char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1553,15 +1696,15 @@ static struct dentry *proc_pident_lookup(struct inode *dir, { struct inode *inode; struct dentry *error; - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1748,6 +1891,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (pid_revalidate(dentry, NULL)) error = NULL; out: + put_task_struct(task); +out_no_task: return error; } @@ -1771,10 +1916,12 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1816,10 +1963,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct 
inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1981,10 +2130,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; @@ -2024,6 +2176,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry out_drop_task: put_task_struct(task); out: + put_task_struct(leader); +out_no_task: return result; } @@ -2163,12 +2317,7 @@ static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr /* If nr exceeds the number of threads there is nothing todo */ if (nr) { - int threads = 0; - task_lock(leader); - if (leader->signal) - threads = atomic_read(&leader->signal->count); - task_unlock(leader); - if (nr >= threads) + if (nr >= get_nr_threads(leader)) goto done; } @@ -2218,15 +2367,15 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi char buf[PROC_NUMBUF]; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; - struct task_struct *leader = proc_task(inode); + struct task_struct *leader = get_proc_task(inode); struct task_struct *task; int retval = -ENOENT; ino_t ino; int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(leader)) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2266,20 +2415,22 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; - struct task_struct 
*p = proc_task(inode); + struct task_struct *p = get_proc_task(inode); generic_fillattr(inode, stat); - if (pid_alive(p)) { - task_lock(p); - if (p->signal) - stat->nlink += atomic_read(&p->signal->count); - task_unlock(p); + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); } return 0; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fbc94df138a76b..31e0475c6cb9a1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de) static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + tref_put(PROC_I(inode)->tref); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; + ei->tref = NULL; ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 548e7447ea4791..37f1648adc23e0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -10,6 +10,7 @@ */ #include +#include struct vmalloc_info { unsigned long used; @@ -41,13 +42,23 @@ extern struct file_operations proc_maps_operations; extern struct file_operations proc_numa_maps_operations; extern struct file_operations proc_smaps_operations; +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct *proc_task(struct inode *inode) +static inline struct task_ref 
*proc_tref(struct inode *inode) +{ + return PROC_I(inode)->tref; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_tref_task(proc_tref(inode)); } static inline int proc_fd(struct inode *inode) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4187b4e9cdb3e5..abf3208c3f60c9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm = get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -120,7 +124,8 @@ struct mem_size_stats static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v) static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma, *tail_vma = NULL; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. 
We have zero last_addr at @@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_tref_task(priv->tref); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -350,11 +363,9 @@ out: return tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -363,17 +374,27 @@ static void m_stop(struct seq_file *m, void *v) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? 
tail_vma: NULL; } +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, @@ -391,11 +412,18 @@ static struct seq_operations proc_pid_smaps_op = { static int do_maps_open(struct inode *inode, struct file *file, struct seq_operations *ops) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->tref = proc_tref(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } } return ret; } @@ -409,7 +437,7 @@ struct file_operations proc_maps_operations = { .open = maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #ifdef CONFIG_NUMA @@ -431,7 +459,7 @@ struct file_operations proc_numa_maps_operations = { .open = numa_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; #endif @@ -444,5 +472,5 @@ struct file_operations proc_smaps_operations = { .open = smaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d4d2081dbaf785..4c7271f0469718 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -246,7 +246,7 @@ extern void kclist_add(struct kcore_list *, void *, size_t); #endif struct proc_inode { - struct task_struct *task; + struct task_ref *tref; int fd; union { int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **); @@ -266,4 +266,10 @@ static inline 
struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } +struct proc_maps_private { + struct task_ref *tref; + struct task_struct *task; + struct vm_area_struct *tail_vma; +}; + #endif /* _LINUX_PROC_FS_H */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b602f73fb38d02..3e991c0c02e289 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -2442,31 +2443,43 @@ void __cpuset_memory_pressure_bump(void) */ static int proc_cpuset_show(struct seq_file *m, void *v) { + struct task_ref *tref; struct task_struct *tsk; char *buf; - int retval = 0; + int retval; + retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) - return -ENOMEM; + goto out; + + retval = -ESRCH; + tref = m->private; + tsk = get_tref_task(tref); + if (!tsk) + goto out_free; - tsk = m->private; + retval = -EINVAL; mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) - goto out; + goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); -out: +out_unlock: mutex_unlock(&manage_mutex); + put_task_struct(tsk); +out_free: kfree(buf); +out: return retval; } static int cpuset_open(struct inode *inode, struct file *file) { - struct task_struct *tsk = PROC_I(inode)->task; - return single_open(file, proc_cpuset_show, tsk); + struct task_ref *tref = PROC_I(inode)->tref; + return single_open(file, proc_cpuset_show, tref); } struct file_operations proc_cpuset_operations = { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73e0f23b7f51e0..6b9740bbf4c019 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1821,7 +1821,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; struct file *file = vma->vm_file; @@ -1837,7 +1837,7 @@ int show_numa_map(struct seq_file 
*m, void *v) return 0; mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + get_vma_policy(priv->task, vma, vma->vm_start)); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1891,7 +1891,7 @@ out: kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } -- cgit 1.2.3-korg