From: William Lee Irwin III Eliminate the inode waitqueue hashtable using bit_waitqueue() via wait_on_bit() and wake_up_bit() to locate the waitqueue head associated with a bit. Signed-off-by: Andrew Morton --- 25-akpm/fs/fs-writeback.c | 20 +++++++---- 25-akpm/fs/inode.c | 69 +++++++++++--------------------------- 25-akpm/include/linux/fs.h | 3 + 25-akpm/include/linux/writeback.h | 6 +-- 25-akpm/kernel/wait.c | 1 5 files changed, 40 insertions(+), 59 deletions(-) diff -puN fs/fs-writeback.c~eliminate-inode-waitqueue-hashtable fs/fs-writeback.c --- 25/fs/fs-writeback.c~eliminate-inode-waitqueue-hashtable 2004-10-03 16:43:16.670797208 -0700 +++ 25-akpm/fs/fs-writeback.c 2004-10-03 16:43:16.679795840 -0700 @@ -244,6 +244,8 @@ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { + wait_queue_head_t *wqh; + if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) { list_move(&inode->i_list, &inode->i_sb->s_dirty); return 0; @@ -252,12 +254,18 @@ __writeback_single_inode(struct inode *i /* * It's a data-integrity sync. We must wait. */ - while (inode->i_state & I_LOCK) { - __iget(inode); - spin_unlock(&inode_lock); - __wait_on_inode(inode); - iput(inode); - spin_lock(&inode_lock); + if (inode->i_state & I_LOCK) { + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK); + + wqh = bit_waitqueue(&inode->i_state, __I_LOCK); + do { + __iget(inode); + spin_unlock(&inode_lock); + __wait_on_bit(wqh, &wq, &inode->i_state, __I_LOCK, + inode_wait, TASK_UNINTERRUPTIBLE); + iput(inode); + spin_lock(&inode_lock); + } while (inode->i_state & I_LOCK); } return __sync_single_inode(inode, wbc); } diff -puN fs/inode.c~eliminate-inode-waitqueue-hashtable fs/inode.c --- 25/fs/inode.c~eliminate-inode-waitqueue-hashtable 2004-10-03 16:43:16.672796904 -0700 +++ 25-akpm/fs/inode.c 2004-10-03 16:43:16.681795536 -0700 @@ -1264,37 +1264,10 @@ void remove_dquot_ref(struct super_block #endif -/* - * Hashed waitqueues for wait_on_inode(). The table is pretty small - the - * kernel doesn't lock many inodes at the same time. - */ -#define I_WAIT_TABLE_ORDER 3 -static struct i_wait_queue_head { - wait_queue_head_t wqh; -} ____cacheline_aligned_in_smp i_wait_queue_heads[1<i_state & I_LOCK) { - schedule(); - goto repeat; - } - remove_wait_queue(wq, &wait); - __set_current_state(TASK_RUNNING); + schedule(); + return 0; } /* @@ -1303,36 +1276,39 @@ repeat: * that it isn't found. This is because iget will immediately call * ->read_inode, and we want to be sure that evidence of the deletion is found * by ->read_inode. - * - * This call might return early if an inode which shares the waitq is woken up. - * This is most easily handled by the caller which will loop around again - * looking for the inode. - * * This is called with inode_lock held. */ static void __wait_on_freeing_inode(struct inode *inode) { - DECLARE_WAITQUEUE(wait, current); - wait_queue_head_t *wq = i_waitq_head(inode); + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - add_wait_queue(wq, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); + /* + * I_FREEING and I_CLEAR are cleared in process context under + * inode_lock, so we have to give the tasks who would clear them + * a chance to run and acquire inode_lock. + */ + if (!(inode->i_state & I_LOCK)) { + spin_unlock(&inode_lock); + yield(); + spin_lock(&inode_lock); + return; + } + wq = bit_waitqueue(&inode->i_state, __I_LOCK); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); schedule(); - remove_wait_queue(wq, &wait); + finish_wait(wq, &wait.wait); spin_lock(&inode_lock); } void wake_up_inode(struct inode *inode) { - wait_queue_head_t *wq = i_waitq_head(inode); - /* * Prevent speculative execution through spin_unlock(&inode_lock); */ smp_mb(); - if (waitqueue_active(wq)) - wake_up_all(wq); + wake_up_bit(&inode->i_state, __I_LOCK); } EXPORT_SYMBOL(wake_up_inode); @@ -1368,11 +1344,6 @@ void __init inode_init_early(void) void __init inode_init(unsigned long mempages) { - int i; - - for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++) - init_waitqueue_head(&i_wait_queue_heads[i].wqh); - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, SLAB_PANIC, init_once, NULL); diff -puN include/linux/fs.h~eliminate-inode-waitqueue-hashtable include/linux/fs.h --- 25/include/linux/fs.h~eliminate-inode-waitqueue-hashtable 2004-10-03 16:43:16.673796752 -0700 +++ 25-akpm/include/linux/fs.h 2004-10-03 16:43:16.682795384 -0700 @@ -1056,7 +1056,8 @@ struct super_operations { #define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */ #define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */ #define I_DIRTY_PAGES 4 /* Data-related inode changes pending */ -#define I_LOCK 8 +#define __I_LOCK 3 +#define I_LOCK (1 << __I_LOCK) #define I_FREEING 16 #define I_CLEAR 32 #define I_NEW 64 diff -puN include/linux/writeback.h~eliminate-inode-waitqueue-hashtable include/linux/writeback.h --- 25/include/linux/writeback.h~eliminate-inode-waitqueue-hashtable 2004-10-03 16:43:16.675796448 -0700 +++ 25-akpm/include/linux/writeback.h 2004-10-03 16:43:16.683795232 -0700 @@ -68,7 +68,7 @@ struct writeback_control { */ void writeback_inodes(struct writeback_control *wbc); void wake_up_inode(struct inode *inode); -void __wait_on_inode(struct inode * inode); +int inode_wait(void *); void sync_inodes_sb(struct super_block *, int wait); void sync_inodes(int wait); @@ -76,8 +76,8 @@ void sync_inodes(int wait); static inline void wait_on_inode(struct inode *inode) { might_sleep(); - if (inode->i_state & I_LOCK) - __wait_on_inode(inode); + wait_on_bit(&inode->i_state, __I_LOCK, inode_wait, + TASK_UNINTERRUPTIBLE); } /* diff -puN kernel/wait.c~eliminate-inode-waitqueue-hashtable kernel/wait.c --- 25/kernel/wait.c~eliminate-inode-waitqueue-hashtable 2004-10-03 16:43:16.676796296 -0700 +++ 25-akpm/kernel/wait.c 2004-10-03 16:43:16.683795232 -0700 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include _