From: Kirill Korotaev This patch fixes dcache race between shrink_dcache_XXX functions and dput(). Example race scenario: CPU 0 CPU 1 umount /dev/sda1 generic_shutdown_super shrink_dcache_memory() shrink_dcache_parent dput dentry select_parent prune_one_dentry() <<<< child is dead, locks are released, but parent is still referenced!!! >>>> skip dentry->parent, since it's d_count > 0 message: BUSY inodes after umount... <<< parent is left on dentry_unused list, referencing freed super block We faced these messages about busy inodes constantly after some stress testing with mount/umount operations parrallel with some other activity. This patch helped the problem. The patch was heavilly tested on 2.6.8 during 2 months, this forward-ported version boots and works ok as well. This patch solves the race mentioned above via introducing per super block list of dentries which are being held indirectly by it's child which is going away. This usually happens when a child is dput()'ed last time and is holding it's parent reference though it is not in any lists already and can't be found anywhere. To make sure that shrink_dcache_parent() can't shrink dcache tree because of a real reference and not such indirect one we check a race condition by looking through dcache_shrinker list. If the current dentry is found on dcache_shrinker list than it is being referenced indirectly as described above and should be waited for, otherwise we really can't shrink the tree. Why is it so hacky? We tried to develop a solution which minimally influences perfomance, not slowdowning a fast path and not introducing lock contention via widening existing locks. I'll be happy if someone fixes it more efficiently. Signed-Off-By: Kirill Korotaev Signed-Off-By: Andrey Savochkin Signed-Off-By: Dmitry Mishin Signed-off-by: Andrew Morton --- fs/dcache.c | 242 +++++++++++++++++++++++++++++++++++++++++-------- fs/super.c | 4 include/linux/dcache.h | 3 include/linux/fs.h | 1 4 files changed, 209 insertions(+), 41 deletions(-) diff -puN fs/dcache.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount fs/dcache.c --- 25/fs/dcache.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-06-27 18:32:10.000000000 -0700 +++ 25-akpm/fs/dcache.c 2005-06-27 18:32:10.000000000 -0700 @@ -111,6 +111,77 @@ static inline void dentry_iput(struct de } } +struct dcache_shrinker { + struct list_head list; + struct dentry *dentry; +}; + +static DECLARE_WAIT_QUEUE_HEAD(dcache_shrinker_wq); + +/* called under dcache_lock */ +static void dcache_shrinker_add(struct dcache_shrinker *ds, + struct dentry *parent, struct dentry *dentry) +{ + if (parent != dentry) { + struct super_block *sb; + + sb = parent->d_sb; + ds->dentry = parent; + list_add(&ds->list, &sb->s_dshrinkers); + } else { + INIT_LIST_HEAD(&ds->list); + } +} + +/* called under dcache_lock */ +static void dcache_shrinker_del(struct dcache_shrinker *ds) +{ + if (ds == NULL || list_empty(&ds->list)) + return; + + list_del_init(&ds->list); + wake_up_all(&dcache_shrinker_wq); +} + +/* called under dcache_lock, drops inside */ +static void dcache_shrinker_wait(struct super_block *sb) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&dcache_shrinker_wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&dcache_lock); + + schedule(); + finish_wait(&dcache_shrinker_wq, &wait); +} + +void dcache_shrinker_wait_sb(struct super_block *sb) +{ + /* the root dentry can be held in dput_recursive */ + spin_lock(&dcache_lock); + while (!list_empty(&sb->s_dshrinkers)) { + dcache_shrinker_wait(sb); + spin_lock(&dcache_lock); + } + spin_unlock(&dcache_lock); +} + +/* dcache_lock protects shrinker's list */ +static void shrink_dcache_racecheck(struct dentry *parent, int *racecheck) +{ + struct super_block *sb; + struct dcache_shrinker *ds; + + sb = parent->d_sb; + list_for_each_entry(ds, &sb->s_dshrinkers, list) { + /* is one of dcache shrinkers working on the dentry? */ + if (ds->dentry == parent) { + *racecheck = 1; + break; + } + } +} + /* * This is dput * @@ -129,8 +200,9 @@ static inline void dentry_iput(struct de */ /* - * dput - release a dentry - * @dentry: dentry to release + * dput_recursive - go upward through the dentry tree and release dentries + * @dentry: starting dentry + * @ds: shrinker to be added to active list (see shrink_dcache_parent) * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and @@ -140,17 +212,15 @@ static inline void dentry_iput(struct de * no dcache lock, please. */ -void dput(struct dentry *dentry) +static void dput_recursive(struct dentry *dentry, struct dcache_shrinker *ds) { - if (!dentry) - return; - -repeat: if (atomic_read(&dentry->d_count) == 1) might_sleep(); if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) return; + dcache_shrinker_del(ds); +repeat: spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count)) { spin_unlock(&dentry->d_lock); @@ -182,6 +252,7 @@ unhash_it: kill_it: { struct dentry *parent; + struct dcache_shrinker lds; /* If dentry was on d_lru list * delete it from there @@ -191,18 +262,47 @@ kill_it: { dentry_stat.nr_unused--; } list_del(&dentry->d_child); + parent = dentry->d_parent; + dcache_shrinker_add(&lds, parent, dentry); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); - parent = dentry->d_parent; d_free(dentry); - if (dentry == parent) + if (unlikely(dentry == parent)) { + spin_lock(&dcache_lock); + dcache_shrinker_del(&lds); + spin_unlock(&dcache_lock); return; + } dentry = parent; - goto repeat; + spin_lock(&dcache_lock); + dcache_shrinker_del(&lds); + if (atomic_dec_and_test(&dentry->d_count)) + goto repeat; + spin_unlock(&dcache_lock); } } +/* + * dput - release a dentry + * @dentry: dentry to release + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and + * releasing its resources. If the parent dentries were scheduled for release + * they too may now get deleted. + * + * no dcache lock, please. + */ + +void dput(struct dentry *dentry) +{ + if (!dentry) + return; + + dput_recursive(dentry, NULL); +} + /** * d_invalidate - invalidate a dentry * @dentry: dentry to invalidate @@ -361,19 +461,23 @@ restart: * removed. * Called with dcache_lock, drops it and then regains. */ -static inline void prune_one_dentry(struct dentry * dentry) +static void prune_one_dentry(struct dentry *dentry) { struct dentry * parent; + struct dcache_shrinker ds; __d_drop(dentry); list_del(&dentry->d_child); + parent = dentry->d_parent; + dcache_shrinker_add(&ds, parent, dentry); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; d_free(dentry); if (parent != dentry) - dput(parent); + dput_recursive(parent, &ds); spin_lock(&dcache_lock); + dcache_shrinker_del(&ds); } /** @@ -562,13 +666,12 @@ positive: * drop the lock and return early due to latency * constraints. */ -static int select_parent(struct dentry * parent) +static int select_parent(struct dentry *parent, int *racecheck) { struct dentry *this_parent = parent; struct list_head *next; int found = 0; - spin_lock(&dcache_lock); repeat: next = this_parent->d_subdirs.next; resume: @@ -610,6 +713,9 @@ dentry->d_parent->d_name.name, dentry->d #endif goto repeat; } + + if (!found && racecheck != NULL) + shrink_dcache_racecheck(dentry, racecheck); } /* * All done at this level ... ascend and resume the search. @@ -624,7 +730,6 @@ this_parent->d_parent->d_name.name, this goto resume; } out: - spin_unlock(&dcache_lock); return found; } @@ -637,10 +742,66 @@ out: void shrink_dcache_parent(struct dentry * parent) { - int found; + int found, r; - while ((found = select_parent(parent)) != 0) + while (1) { + spin_lock(&dcache_lock); + found = select_parent(parent, NULL); + if (found) + goto found; + + /* + * try again with a dput_recursive() race check. + * it returns quickly if everything was really shrinked + */ + r = 0; + found = select_parent(parent, &r); + if (found) + goto found; + if (!r) + break; + + /* drops the lock inside */ + dcache_shrinker_wait(parent->d_sb); + continue; + +found: + spin_unlock(&dcache_lock); prune_dcache(found); + } + spin_unlock(&dcache_lock); +} + +/* + * Move any unused anon dentries to the end of the unused list. + * called under dcache_lock + */ +static int select_anon(struct hlist_head *head, int *racecheck) +{ + struct hlist_node *lp; + int found = 0; + + hlist_for_each(lp, head) { + struct dentry *this = hlist_entry(lp, struct dentry, d_hash); + if (!list_empty(&this->d_lru)) { + dentry_stat.nr_unused--; + list_del_init(&this->d_lru); + } + + /* + * move only zero ref count dentries to the end + * of the unused list for prune_dcache + */ + if (!atomic_read(&this->d_count)) { + list_add_tail(&this->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + found++; + } + + if (!found && racecheck != NULL) + shrink_dcache_racecheck(this, racecheck); + } + return found; } /** @@ -653,33 +814,36 @@ void shrink_dcache_parent(struct dentry * done under dcache_lock. * */ -void shrink_dcache_anon(struct hlist_head *head) +void shrink_dcache_anon(struct super_block *sb) { - struct hlist_node *lp; - int found; - do { - found = 0; + int found, r; + + while (1) { spin_lock(&dcache_lock); - hlist_for_each(lp, head) { - struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { - dentry_stat.nr_unused--; - list_del_init(&this->d_lru); - } + found = select_anon(&sb->s_anon, NULL); + if (found) + goto found; - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache - */ - if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - found++; - } - } + /* + * try again with a dput_recursive() race check. + * it returns quickly if everything was really shrinked + */ + r = 0; + found = select_anon(&sb->s_anon, &r); + if (found) + goto found; + if (!r) + break; + + /* drops the lock inside */ + dcache_shrinker_wait(sb); + continue; + +found: spin_unlock(&dcache_lock); prune_dcache(found); - } while(found); + } + spin_unlock(&dcache_lock); } /* diff -puN fs/super.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount fs/super.c --- 25/fs/super.c~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-06-27 18:32:10.000000000 -0700 +++ 25-akpm/fs/super.c 2005-06-27 18:32:10.000000000 -0700 @@ -69,6 +69,7 @@ static struct super_block *alloc_super(v INIT_LIST_HEAD(&s->s_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); + INIT_LIST_HEAD(&s->s_dshrinkers); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); @@ -230,8 +231,9 @@ void generic_shutdown_super(struct super if (root) { sb->s_root = NULL; shrink_dcache_parent(root); - shrink_dcache_anon(&sb->s_anon); + shrink_dcache_anon(sb); dput(root); + dcache_shrinker_wait_sb(sb); fsync_super(sb); lock_super(sb); sb->s_flags &= ~MS_ACTIVE; diff -puN include/linux/dcache.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount include/linux/dcache.h --- 25/include/linux/dcache.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-06-27 18:32:10.000000000 -0700 +++ 25-akpm/include/linux/dcache.h 2005-06-27 18:32:10.000000000 -0700 @@ -209,7 +209,8 @@ extern struct dentry * d_alloc_anon(stru extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_anon(struct hlist_head *); +extern void shrink_dcache_anon(struct super_block *sb); +extern void dcache_shrinker_wait_sb(struct super_block *sb); extern int d_invalidate(struct dentry *); /* only used at mount-time */ diff -puN include/linux/fs.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount include/linux/fs.h --- 25/include/linux/fs.h~fix-of-dcache-race-leading-to-busy-inodes-on-umount 2005-06-27 18:32:10.000000000 -0700 +++ 25-akpm/include/linux/fs.h 2005-06-27 18:32:10.000000000 -0700 @@ -786,6 +786,7 @@ struct super_block { struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; + struct list_head s_dshrinkers; /* active dcache shrinkers */ struct block_device *s_bdev; struct list_head s_instances; _