__unhash_process acquires the dcache_lock while holding the tasklist_lock for writing. This can deadlock. Additionally, fs/proc/base.c incorrectly assumed that p->pid would be set to 0 during release_task. The patch fixes that by adding a new spinlock to the task structure and fixing all references to (!p->pid). The alternative to the new spinlock would be to hold dcache_lock around __unhash_process. - fs/proc/base.c assumed that p->pid is reset to 0 during exit. This is not the case anymore. I now look at the count of the pid structure for PIDTYPE_PID. - de_thread now tested - as broken as it was before: open handles to /proc/ are either stale or invalid after an exec of a nptl process, if the exec was call from a secondary thread. - a few lock_kernels removed - that part of /proc doesn't need it. - additional instances of 'if(current->pid)' replaced with pid_alive. fs/exec.c | 37 ++--------- fs/proc/base.c | 154 ++++++++++++++++++++++++++++++++-------------- fs/proc/root.c | 2 include/linux/init_task.h | 1 include/linux/proc_fs.h | 2 include/linux/sched.h | 2 kernel/exit.c | 40 +++-------- 7 files changed, 135 insertions(+), 103 deletions(-) diff -puN fs/exec.c~dcache_lock-vs-tasklist_lock-take-3 fs/exec.c --- 25/fs/exec.c~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/fs/exec.c 2003-05-22 01:16:48.000000000 -0700 @@ -529,30 +529,6 @@ static int exec_mmap(struct mm_struct *m return 0; } -static struct dentry *clean_proc_dentry(struct task_struct *p) -{ - struct dentry *proc_dentry = p->proc_dentry; - - if (proc_dentry) { - spin_lock(&dcache_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - } else - proc_dentry = NULL; - spin_unlock(&dcache_lock); - } - return proc_dentry; -} - -static inline void put_proc_dentry(struct dentry *dentry) -{ - if (dentry) { - shrink_dcache_parent(dentry); - dput(dentry); - } -} - /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without @@ -660,9 +636,11 @@ static inline int de_thread(struct task_ while (leader->state != TASK_ZOMBIE) yield(); + spin_lock(&leader->proc_lock); + spin_lock(¤t->proc_lock); + proc_dentry1 = proc_pid_unhash(current); + proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); - proc_dentry1 = clean_proc_dentry(current); - proc_dentry2 = clean_proc_dentry(leader); if (leader->tgid != current->tgid) BUG(); @@ -702,9 +680,10 @@ static inline int de_thread(struct task_ state = leader->state; write_unlock_irq(&tasklist_lock); - - put_proc_dentry(proc_dentry1); - put_proc_dentry(proc_dentry2); + spin_unlock(&leader->proc_lock); + spin_unlock(¤t->proc_lock); + proc_pid_flush(proc_dentry1); + proc_pid_flush(proc_dentry2); if (state != TASK_ZOMBIE) BUG(); diff -puN fs/proc/base.c~dcache_lock-vs-tasklist_lock-take-3 fs/proc/base.c --- 25/fs/proc/base.c~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/fs/proc/base.c 2003-05-22 01:16:48.000000000 -0700 @@ -624,6 +624,12 @@ static struct inode_operations proc_pid_ .follow_link = proc_pid_follow_link }; +static int pid_alive(struct task_struct *p) +{ + BUG_ON(p->pids[PIDTYPE_PID].pidptr != &p->pids[PIDTYPE_PID].pid); + return atomic_read(&p->pids[PIDTYPE_PID].pid.count); +} + #define NUMBUF 10 static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) @@ -635,6 +641,9 @@ static int proc_readfd(struct file * fil char buf[NUMBUF]; struct files_struct * files; + retval = -ENOENT; + if (!pid_alive(p)) + goto out; retval = 0; pid = p->pid; @@ -696,48 +705,46 @@ static int proc_base_readdir(struct file int pid; struct inode *inode = filp->f_dentry->d_inode; struct pid_entry *p; - int ret = 0; + int ret; - lock_kernel(); + ret = -ENOENT; + if (!pid_alive(proc_task(inode))) + goto out; + ret = 0; pid = proc_task(inode)->pid; - if (!pid) { - ret = -ENOENT; - goto out; - } i = filp->f_pos; switch (i) { - case 0: - if (filldir(dirent, ".", 1, i, inode->i_ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - if (filldir(dirent, "..", 2, i, PROC_ROOT_INO, DT_DIR) < 0) + case 0: + if (filldir(dirent, ".", 1, i, inode->i_ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, PROC_ROOT_INO, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= ARRAY_SIZE(base_stuff)) { + ret = 1; + goto out; + } + p = base_stuff + i; + while (p->name) { + if (filldir(dirent, p->name, p->len, filp->f_pos, + fake_ino(pid, p->type), p->mode >> 12) < 0) goto out; - i++; filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i>=sizeof(base_stuff)/sizeof(base_stuff[0])) { - ret = 1; - goto out; - } - p = base_stuff + i; - while (p->name) { - if (filldir(dirent, p->name, p->len, filp->f_pos, - fake_ino(pid, p->type), p->mode >> 12) < 0) - goto out; - filp->f_pos++; - p++; - } + p++; + } } ret = 1; out: - unlock_kernel(); return ret; } @@ -774,7 +781,7 @@ static struct inode *proc_pid_make_inode inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!task->pid) + if (!pid_alive(task)) goto out_unlock; /* @@ -808,7 +815,7 @@ out_unlock: */ static int pid_revalidate(struct dentry * dentry, int flags) { - if (proc_task(dentry->d_inode)->pid) + if (pid_alive(proc_task(dentry->d_inode))) return 1; d_drop(dentry); return 0; @@ -842,18 +849,23 @@ static int pid_fd_revalidate(struct dent static void pid_base_iput(struct dentry *dentry, struct inode *inode) { struct task_struct *task = proc_task(inode); - write_lock_irq(&tasklist_lock); + spin_lock(&task->proc_lock); if (task->proc_dentry == dentry) task->proc_dentry = NULL; - write_unlock_irq(&tasklist_lock); + spin_unlock(&task->proc_lock); iput(inode); } static int pid_delete_dentry(struct dentry * dentry) { - return proc_task(dentry->d_inode)->pid == 0; + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !pid_alive(proc_task(dentry->d_inode)); } + static struct dentry_operations pid_fd_dentry_operations = { .d_revalidate = pid_fd_revalidate, @@ -909,6 +921,8 @@ static struct dentry *proc_lookupfd(stru if (fd == ~0U) goto out; + if (!pid_alive(task)) + goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_FD_DIR+fd); if (!inode) @@ -937,8 +951,6 @@ static struct dentry *proc_lookupfd(stru ei->op.proc_get_link = proc_fd_link; dentry->d_op = &pid_fd_dentry_operations; d_add(dentry, inode); - if (!proc_task(dentry->d_inode)->pid) - d_drop(dentry); return NULL; out_unlock2: @@ -975,6 +987,9 @@ static struct dentry *proc_base_lookup(s error = -ENOENT; inode = NULL; + if (!pid_alive(task)) + goto out; + for (p = base_stuff; p->name; p++) { if (p->len != dentry->d_name.len) continue; @@ -1056,8 +1071,6 @@ static struct dentry *proc_base_lookup(s } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); - if (!proc_task(dentry->d_inode)->pid) - d_drop(dentry); return NULL; out: @@ -1095,6 +1108,55 @@ static struct inode_operations proc_self .follow_link = proc_self_follow_link, }; +/** + * proc_pid_unhash - Unhash /proc/ entry from the dcache. + * @p: task that should be flushed. + * + * Drops the /proc/ dcache entry from the hash chains. + * + * Dropping /proc/ entries and detach_pid must be synchroneous, + * otherwise e.g. /proc//exe might point to the wrong executable, + * if the pid value is immediately reused. This is enforced by + * - caller must acquire spin_lock(p->proc_lock) + * - must be called before detach_pid() + * - proc_pid_lookup acquires proc_lock, and checks that + * the target is not dead by looking at the attach count + * of PIDTYPE_PID. + */ + +struct dentry *proc_pid_unhash(struct task_struct *p) +{ + struct dentry *proc_dentry; + + proc_dentry = p->proc_dentry; + if (proc_dentry != NULL) { + + spin_lock(&dcache_lock); + if (!d_unhashed(proc_dentry)) { + dget_locked(proc_dentry); + __d_drop(proc_dentry); + } else + proc_dentry = NULL; + spin_unlock(&dcache_lock); + } + return proc_dentry; +} + +/** + * proc_pid_flush - recover memory used by stale /proc//x entries + * @proc_entry: directoy to prune. + * + * Shrink the /proc directory that was used by the just killed thread. + */ + +void proc_pid_flush(struct dentry *proc_dentry) +{ + if(proc_dentry != NULL) { + shrink_dcache_parent(proc_dentry); + dput(proc_dentry); + } +} + /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) { @@ -1143,12 +1205,12 @@ struct dentry *proc_pid_lookup(struct in inode->i_flags|=S_IMMUTABLE; dentry->d_op = &pid_base_dentry_operations; + + spin_lock(&task->proc_lock); + task->proc_dentry = dentry; d_add(dentry, inode); - read_lock(&tasklist_lock); - proc_task(dentry->d_inode)->proc_dentry = dentry; - read_unlock(&tasklist_lock); - if (!proc_task(dentry->d_inode)->pid) - d_drop(dentry); + spin_unlock(&task->proc_lock); + return NULL; out: return ERR_PTR(-ENOENT); @@ -1171,7 +1233,7 @@ static int get_pid_list(int index, unsig read_lock(&tasklist_lock); for_each_process(p) { int pid = p->pid; - if (!pid) + if (!pid_alive(p)) continue; if (--index >= 0) continue; diff -puN fs/proc/root.c~dcache_lock-vs-tasklist_lock-take-3 fs/proc/root.c --- 25/fs/proc/root.c~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/fs/proc/root.c 2003-05-22 01:16:48.000000000 -0700 @@ -110,9 +110,9 @@ static int proc_root_readdir(struct file } filp->f_pos = FIRST_PROCESS_ENTRY; } + unlock_kernel(); ret = proc_pid_readdir(filp, dirent, filldir); - unlock_kernel(); return ret; } diff -puN include/linux/init_task.h~dcache_lock-vs-tasklist_lock-take-3 include/linux/init_task.h --- 25/include/linux/init_task.h~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/include/linux/init_task.h 2003-05-22 01:16:48.000000000 -0700 @@ -101,6 +101,7 @@ .blocked = {{0}}, \ .posix_timers = LIST_HEAD_INIT(tsk.posix_timers), \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ + .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ } diff -puN include/linux/proc_fs.h~dcache_lock-vs-tasklist_lock-take-3 include/linux/proc_fs.h --- 25/include/linux/proc_fs.h~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/include/linux/proc_fs.h 2003-05-22 01:16:48.000000000 -0700 @@ -87,6 +87,8 @@ extern void proc_root_init(void); extern void proc_misc_init(void); struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry); +struct dentry *proc_pid_unhash(struct task_struct *p); +void proc_pid_flush(struct dentry *proc_dentry); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, diff -puN include/linux/sched.h~dcache_lock-vs-tasklist_lock-take-3 include/linux/sched.h --- 25/include/linux/sched.h~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/include/linux/sched.h 2003-05-22 01:16:48.000000000 -0700 @@ -429,6 +429,8 @@ struct task_struct { u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ + spinlock_t proc_lock; /* context-switch lock */ spinlock_t switch_lock; diff -puN kernel/exit.c~dcache_lock-vs-tasklist_lock-take-3 kernel/exit.c --- 25/kernel/exit.c~dcache_lock-vs-tasklist_lock-take-3 2003-05-22 01:16:48.000000000 -0700 +++ 25-akpm/kernel/exit.c 2003-05-22 01:16:48.000000000 -0700 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -31,10 +32,8 @@ extern struct task_struct *child_reaper; int getrusage(struct task_struct *, int, struct rusage *); -static struct dentry * __unhash_process(struct task_struct *p) +static void __unhash_process(struct task_struct *p) { - struct dentry *proc_dentry; - nr_threads--; detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_TGID); @@ -46,34 +45,25 @@ static struct dentry * __unhash_process( } REMOVE_LINKS(p); - proc_dentry = p->proc_dentry; - if (unlikely(proc_dentry != NULL)) { - spin_lock(&dcache_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - } else - proc_dentry = NULL; - spin_unlock(&dcache_lock); - } - return proc_dentry; } void release_task(struct task_struct * p) { - struct dentry *proc_dentry; task_t *leader; + struct dentry *proc_dentry; BUG_ON(p->state < TASK_ZOMBIE); atomic_dec(&p->user->processes); + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); if (unlikely(p->ptrace)) __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); __exit_sighand(p); - proc_dentry = __unhash_process(p); + __unhash_process(p); /* * If we are the last non-leader member of the thread @@ -92,11 +82,8 @@ void release_task(struct task_struct * p p->parent->cnswap += p->nswap + p->cnswap; sched_exit(p); write_unlock_irq(&tasklist_lock); - - if (unlikely(proc_dentry != NULL)) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); - } + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); release_thread(p); put_task_struct(p); } @@ -107,14 +94,13 @@ void unhash_process(struct task_struct * { struct dentry *proc_dentry; + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); - proc_dentry = __unhash_process(p); + __unhash_process(p); write_unlock_irq(&tasklist_lock); - - if (unlikely(proc_dentry != NULL)) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); - } + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); } /* _