__unhash_process acquires the dcache_lock while holding the tasklist_lock for writing. This can deadlock. While trying to fix that race, I found further races with proc_dentry. The attached patch (hopefully) fixes all these races. Changes: - fs/proc/base.c assumed that p->pid is reset to 0 during exit. This is not the case anymore. I now look at the count of the pid structure for PIDTYPE_PID. - pid_delete_dentry and pid_revalidate removed. The implementation didn't work, and noone complained. - only d_add the new proc entry if the task is still valid: d_add+d_drop is a race, if a lookup happens between both calls. - add a new spinlock that protects proc_dentry. fs/exec.c | 37 +++-------------- fs/proc/base.c | 99 ++++++++++++++++++++++++++++++++-------------- include/linux/init_task.h | 1 include/linux/proc_fs.h | 2 include/linux/sched.h | 2 kernel/exit.c | 40 ++++++------------ 6 files changed, 96 insertions(+), 85 deletions(-) diff -puN fs/exec.c~tasklist_lock-dcache_lock-inversion-fix fs/exec.c --- 25/fs/exec.c~tasklist_lock-dcache_lock-inversion-fix 2003-04-14 00:27:59.000000000 -0700 +++ 25-akpm/fs/exec.c 2003-04-14 00:27:59.000000000 -0700 @@ -527,30 +527,6 @@ static int exec_mmap(struct mm_struct *m return 0; } -static struct dentry *clean_proc_dentry(struct task_struct *p) -{ - struct dentry *proc_dentry = p->proc_dentry; - - if (proc_dentry) { - spin_lock(&dcache_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - } else - proc_dentry = NULL; - spin_unlock(&dcache_lock); - } - return proc_dentry; -} - -static inline void put_proc_dentry(struct dentry *dentry) -{ - if (dentry) { - shrink_dcache_parent(dentry); - dput(dentry); - } -} - /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without @@ -658,9 +634,11 @@ static inline int de_thread(struct task_ while (leader->state != TASK_ZOMBIE) yield(); + task_lock(leader); + task_lock(current); + proc_dentry1 = proc_pid_unhash(current); + proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); - proc_dentry1 = clean_proc_dentry(current); - proc_dentry2 = clean_proc_dentry(leader); if (leader->tgid != current->tgid) BUG(); @@ -700,9 +678,10 @@ static inline int de_thread(struct task_ state = leader->state; write_unlock_irq(&tasklist_lock); - - put_proc_dentry(proc_dentry1); - put_proc_dentry(proc_dentry2); + task_unlock(current); + task_unlock(leader); + proc_pid_flush(proc_dentry1); + proc_pid_flush(proc_dentry2); if (state != TASK_ZOMBIE) BUG(); diff -puN fs/proc/base.c~tasklist_lock-dcache_lock-inversion-fix fs/proc/base.c --- 25/fs/proc/base.c~tasklist_lock-dcache_lock-inversion-fix 2003-04-14 00:27:59.000000000 -0700 +++ 25-akpm/fs/proc/base.c 2003-04-14 00:27:59.000000000 -0700 @@ -799,17 +799,11 @@ out_unlock: /* dentry stuff */ -/* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - */ -static int pid_revalidate(struct dentry * dentry, int flags) +static int pid_alive(struct task_struct *p) { - if (proc_task(dentry->d_inode)->pid) - return 1; - d_drop(dentry); - return 0; + BUG_ON(p->pids[PIDTYPE_PID].pidptr != &p->pids[PIDTYPE_PID].pid); + + return atomic_read(&p->pids[PIDTYPE_PID].pid.count); } static int pid_fd_revalidate(struct dentry * dentry, int flags) @@ -840,35 +834,25 @@ static int pid_fd_revalidate(struct dent static void pid_base_iput(struct dentry *dentry, struct inode *inode) { struct task_struct *task = proc_task(inode); - write_lock_irq(&tasklist_lock); + spin_lock(&task->proc_lock); if (task->proc_dentry == dentry) task->proc_dentry = NULL; - write_unlock_irq(&tasklist_lock); + spin_unlock(&task->proc_lock); iput(inode); } -static int pid_delete_dentry(struct dentry * dentry) -{ - return proc_task(dentry->d_inode)->pid == 0; -} - static struct dentry_operations pid_fd_dentry_operations = { .d_revalidate = pid_fd_revalidate, - .d_delete = pid_delete_dentry, }; static struct dentry_operations pid_dentry_operations = { - .d_revalidate = pid_revalidate, - .d_delete = pid_delete_dentry, }; static struct dentry_operations pid_base_dentry_operations = { - .d_revalidate = pid_revalidate, .d_iput = pid_base_iput, - .d_delete = pid_delete_dentry, }; /* Lookups */ @@ -1093,6 +1077,60 @@ static struct inode_operations proc_self .follow_link = proc_self_follow_link, }; +/** + * proc_pid_unhash - Unhash /proc/ entry from the dcache. + * @p: task that should be flushed. + * + * Drops the /proc/ dcache entry from the hash chains. + * + * As a special exception, dropping happens even if the directory + * is busy. Normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. (No idea what that means - manfred) + * + * Dropping /proc/ entries and detach_pid must be synchroneous, + * otherwise e.g. /proc//exe might point to the wrong executable, + * if the pid value is immediately reused. This is enforced by + * - caller must acquire spin_lock(p->proc_lock) + * - must be called before detach_pid() + * - proc_pid_lookup acquires proc_lock, and checks that + * the target is not dead by looking at the attach count + * of PIDTYPE_PID. + */ + +struct dentry *proc_pid_unhash(struct task_struct *p) +{ + struct dentry *proc_dentry; + + proc_dentry = p->proc_dentry; + if (proc_dentry != NULL) { + + spin_lock(&dcache_lock); + if (!d_unhashed(proc_dentry)) { + dget_locked(proc_dentry); + __d_drop(proc_dentry); + } else + proc_dentry = NULL; + spin_unlock(&dcache_lock); + } + return proc_dentry; +} + +/** + * proc_pid_flush - recover memory used by stale /proc//x entries + * @proc_entry: directoy to prune. + * + * Shrink the /proc directory that was used by the just killed thread. + */ + +void proc_pid_flush(struct dentry *proc_dentry) +{ + if(proc_dentry != NULL) { + shrink_dcache_parent(proc_dentry); + dput(proc_dentry); + } +} + /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) { @@ -1141,13 +1179,16 @@ struct dentry *proc_pid_lookup(struct in inode->i_flags|=S_IMMUTABLE; dentry->d_op = &pid_base_dentry_operations; - d_add(dentry, inode); - read_lock(&tasklist_lock); - proc_task(dentry->d_inode)->proc_dentry = dentry; - read_unlock(&tasklist_lock); - if (!proc_task(dentry->d_inode)->pid) - d_drop(dentry); - return NULL; + + spin_lock(&task->proc_lock); + if (pid_alive(task)) { + task->proc_dentry = dentry; + d_add(dentry, inode); + dentry = NULL; + } + spin_unlock(&task->proc_lock); + if (!dentry) + return NULL; out: return ERR_PTR(-ENOENT); } diff -puN include/linux/init_task.h~tasklist_lock-dcache_lock-inversion-fix include/linux/init_task.h --- 25/include/linux/init_task.h~tasklist_lock-dcache_lock-inversion-fix 2003-04-14 00:27:59.000000000 -0700 +++ 25-akpm/include/linux/init_task.h 2003-04-14 00:27:59.000000000 -0700 @@ -101,6 +101,7 @@ .blocked = {{0}}, \ .posix_timers = LIST_HEAD_INIT(tsk.posix_timers), \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ + .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ } diff -puN include/linux/proc_fs.h~tasklist_lock-dcache_lock-inversion-fix include/linux/proc_fs.h --- 25/include/linux/proc_fs.h~tasklist_lock-dcache_lock-inversion-fix 2003-04-14 00:27:59.000000000 -0700 +++ 25-akpm/include/linux/proc_fs.h 2003-04-14 00:27:59.000000000 -0700 @@ -87,6 +87,8 @@ extern void proc_root_init(void); extern void proc_misc_init(void); struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry); +struct dentry *proc_pid_unhash(struct task_struct *p); +void proc_pid_flush(struct dentry *proc_dentry); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, diff -puN include/linux/sched.h~tasklist_lock-dcache_lock-inversion-fix include/linux/sched.h --- 25/include/linux/sched.h~tasklist_lock-dcache_lock-inversion-fix 2003-04-14 00:27:59.000000000 -0700 +++ 25-akpm/include/linux/sched.h 2003-04-14 00:27:59.000000000 -0700 @@ -428,6 +428,8 @@ struct task_struct { u32 self_exec_id; /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; +/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ + spinlock_t proc_lock; /* context-switch lock */ spinlock_t switch_lock; diff -puN kernel/exit.c~tasklist_lock-dcache_lock-inversion-fix kernel/exit.c --- 25/kernel/exit.c~tasklist_lock-dcache_lock-inversion-fix 2003-04-14 00:27:59.000000000 -0700 +++ 25-akpm/kernel/exit.c 2003-04-14 00:27:59.000000000 -0700 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -31,10 +32,8 @@ extern struct task_struct *child_reaper; int getrusage(struct task_struct *, int, struct rusage *); -static struct dentry * __unhash_process(struct task_struct *p) +static void __unhash_process(struct task_struct *p) { - struct dentry *proc_dentry; - nr_threads--; detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_TGID); @@ -46,34 +45,25 @@ static struct dentry * __unhash_process( } REMOVE_LINKS(p); - proc_dentry = p->proc_dentry; - if (unlikely(proc_dentry != NULL)) { - spin_lock(&dcache_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - } else - proc_dentry = NULL; - spin_unlock(&dcache_lock); - } - return proc_dentry; } void release_task(struct task_struct * p) { - struct dentry *proc_dentry; task_t *leader; + struct dentry *proc_dentry; BUG_ON(p->state < TASK_ZOMBIE); atomic_dec(&p->user->processes); + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); if (unlikely(p->ptrace)) __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); __exit_sighand(p); - proc_dentry = __unhash_process(p); + __unhash_process(p); /* * If we are the last non-leader member of the thread @@ -92,11 +82,8 @@ void release_task(struct task_struct * p p->parent->cnswap += p->nswap + p->cnswap; sched_exit(p); write_unlock_irq(&tasklist_lock); - - if (unlikely(proc_dentry != NULL)) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); - } + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); release_thread(p); put_task_struct(p); } @@ -107,14 +94,13 @@ void unhash_process(struct task_struct * { struct dentry *proc_dentry; + spin_lock(&p->proc_lock); + proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); - proc_dentry = __unhash_process(p); + __unhash_process(p); write_unlock_irq(&tasklist_lock); - - if (unlikely(proc_dentry != NULL)) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); - } + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); } /* _