From: Neil Brown When an NFS request arrives, it contains a filehandle which needs to be converted to a dentry. Many filesystems use find_exported_dentry in fs/exportfs/expfs.c. A key part of this on filesystem where a 32bit inode number uniquely locates a file is export_iget which calls iget(sb, inum). iget will either: 1/ find the inode in the inode cache and return it or 2/ create a new inode and call ->read_inode to load it from the storage device. export_iget then verifies the inode is really a good inode (->read_inode didn't detect any problems) and the right inode (base on generation number from the file handle). For this to work reliably, it is important that whenever an inode is *not* in the cache, the on-device version is up-to-date. Otherwise, when read_inode loads the inode it will get bad data. For a file that has not been deleted, this condition always holds: a dirty inode is always flushed to disc before the inode is unhashed. However for a file that is being deleted this condition doesn't (didn't) hold. When iput -> iput_final -> generic_drop_inode -> generic_delete_inode is called we would unhash the inode before calling into the filesytem through ->delete_inode. So there is a small window between when generic_delete_inode unhashes the inode, and when ->delete_inode writes something to disc, where a call to ->read_inode (for export_iget) might discover what it thinks is a valid inode, but is really one that is in the process of being destroyed. It is this window that I want to close by moving the unhashing to the end of generic_delete_inode. fs/fs-writeback.c | 3 ++- fs/inode.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff -puN fs/fs-writeback.c~inode-unhashing-fix-2 fs/fs-writeback.c --- 25/fs/fs-writeback.c~inode-unhashing-fix-2 2003-05-17 14:09:38.000000000 -0700 +++ 25-akpm/fs/fs-writeback.c 2003-05-17 14:09:38.000000000 -0700 @@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *in * Only add valid (hashed) inodes to the superblock's * dirty list. Add blockdev inodes as well. */ - if (hlist_unhashed(&inode->i_hash) && !S_ISBLK(inode->i_mode)) + if ((hlist_unhashed(&inode->i_hash) || (inode->i_state & (I_FREEING|I_CLEAR))) + && !S_ISBLK(inode->i_mode)) goto out; /* diff -puN fs/inode.c~inode-unhashing-fix-2 fs/inode.c --- 25/fs/inode.c~inode-unhashing-fix-2 2003-05-17 14:09:38.000000000 -0700 +++ 25-akpm/fs/inode.c 2003-05-17 14:09:38.000000000 -0700 @@ -466,6 +466,7 @@ static int shrink_icache_memory(int nr, return inodes_stat.nr_unused; } +static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. * NOTE: we are not increasing the inode-refcount, you must call __iget() @@ -477,6 +478,7 @@ static struct inode * find_inode(struct struct hlist_node *node; struct inode * inode = NULL; +repeat: hlist_for_each (node, head) { prefetch(node->next); inode = hlist_entry(node, struct inode, i_hash); @@ -484,6 +486,10 @@ static struct inode * find_inode(struct continue; if (!test(inode, data)) continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + goto repeat; + } break; } return node ? inode : NULL; @@ -498,6 +504,7 @@ static struct inode * find_inode_fast(st struct hlist_node *node; struct inode * inode = NULL; +repeat: hlist_for_each (node, head) { prefetch(node->next); inode = list_entry(node, struct inode, i_hash); @@ -505,6 +512,10 @@ static struct inode * find_inode_fast(st continue; if (inode->i_sb != sb) continue; + if (inode->i_state & (I_FREEING|I_CLEAR)) { + __wait_on_freeing_inode(inode); + goto repeat; + } break; } return node ? inode : NULL; @@ -933,11 +944,22 @@ void remove_inode_hash(struct inode *ino spin_unlock(&inode_lock); } +/* + * Tell the filesystem that this inode is no longer of any interest and should + * be completely destroyed. + * + * We leave the inode in the inode hash table until *after* the filesystem's + * ->delete_inode completes. This ensures that an iget (such as nfsd might + * instigate) will always find up-to-date information either in the hash or on + * disk. + * + * I_FREEING is set so that no-one will take a new reference to the inode while + * it is being deleted. + */ void generic_delete_inode(struct inode *inode) { struct super_operations *op = inode->i_sb->s_op; - hlist_del_init(&inode->i_hash); list_del_init(&inode->i_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; @@ -956,6 +978,10 @@ void generic_delete_inode(struct inode * delete(inode); } else clear_inode(inode); + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); if (inode->i_state != I_CLEAR) BUG(); destroy_inode(inode); @@ -1229,6 +1255,32 @@ repeat: __set_current_state(TASK_RUNNING); } +/* + * If we try to find an inode in the inode hash while it is being deleted, we + * have to wait until the filesystem completes its deletion before reporting + * that it isn't found. This is because iget will immediately call + * ->read_inode, and we want to be sure that evidence of the deletion is found + * by ->read_inode. + * + * This call might return early if an inode which shares the waitq is woken up. + * This is most easily handled by the caller which will loop around again + * looking for the inode. + * + * This is called with inode_lock held. + */ +static void __wait_on_freeing_inode(struct inode *inode) +{ + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *wq = i_waitq_head(inode); + + add_wait_queue(wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&inode_lock); + schedule(); + remove_wait_queue(wq, &wait); + spin_lock(&inode_lock); +} + void wake_up_inode(struct inode *inode) { wait_queue_head_t *wq = i_waitq_head(inode); _