From: William Lee Irwin III

On Tue, May 04, 2004 at 11:11:21PM -0700, William Lee Irwin III wrote:
> Now, make bh's use the new wakeup primitive also. This has the bugfix
> vs. the prior version that autoremoved waitqueue wakeup functions are
> made to match autoremove API usage in __wait_event_filtered().

This is still grossly inefficient, in that it's only necessary to wake
one waiter when the waiter promises to eventually issue another wakeup,
e.g. when it releases the bit on the page.  So here, wake-one semantics
are implemented for those cases, using the WQ_FLAG_EXCLUSIVE flag in the
waitqueue and the surrounding APIs, e.g. prepare_to_wait_exclusive().

I took the small liberty of adding list_for_each_entry_reverse_safe()
to list.h, as it generally makes sense and gives the opportunity for
fair FIFO wakeups wrapped up in a neat API.

---

 25-akpm/fs/buffer.c                 |   32 +++++++++++++++++++++++++++-----
 25-akpm/include/linux/buffer_head.h |    5 +++--
 25-akpm/include/linux/list.h        |   13 +++++++++++++
 25-akpm/kernel/sched.c              |   11 ++++++++---
 25-akpm/mm/filemap.c                |   30 +++++++++++++++++++-----------
 5 files changed, 70 insertions(+), 21 deletions(-)

diff -puN fs/buffer.c~wake-one-pg_locked-bh_lock-semantics fs/buffer.c
--- 25/fs/buffer.c~wake-one-pg_locked-bh_lock-semantics	2004-05-04 23:56:33.605541976 -0700
+++ 25-akpm/fs/buffer.c	2004-05-04 23:56:33.616540304 -0700
@@ -78,6 +78,32 @@ void wake_up_buffer(struct buffer_head *
 }
 EXPORT_SYMBOL(wake_up_buffer);
 
+static void sync_buffer(struct buffer_head *bh)
+{
+	struct block_device *bd;
+
+	smp_mb();
+	bd = bh->b_bdev;
+	if (bd)
+		blk_run_address_space(bd->bd_inode->i_mapping);
+}
+
+void fastcall __lock_buffer(struct buffer_head *bh)
+{
+	wait_queue_head_t *wqh = bh_waitq_head(bh);
+	DEFINE_FILTERED_WAIT(wait, bh);
+
+	do {
+		prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (buffer_locked(bh)) {
+			sync_buffer(bh);
+			io_schedule();
+		}
+	} while (test_set_buffer_locked(bh));
+	finish_wait(wqh, &wait.wait);
+}
+EXPORT_SYMBOL(__lock_buffer);
+
 void fastcall unlock_buffer(struct buffer_head *bh)
 {
 	clear_buffer_locked(bh);
@@ -98,11 +124,7 @@ void __wait_on_buffer(struct buffer_head
 	do {
 		prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			struct block_device *bd;
-			smp_mb();
-			bd = bh->b_bdev;
-			if (bd)
-				blk_run_address_space(bd->bd_inode->i_mapping);
+			sync_buffer(bh);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
diff -puN include/linux/buffer_head.h~wake-one-pg_locked-bh_lock-semantics include/linux/buffer_head.h
--- 25/include/linux/buffer_head.h~wake-one-pg_locked-bh_lock-semantics	2004-05-04 23:56:33.607541672 -0700
+++ 25-akpm/include/linux/buffer_head.h	2004-05-04 23:56:33.618540000 -0700
@@ -170,6 +170,7 @@ struct buffer_head *__bread(struct block
 struct buffer_head *alloc_buffer_head(int gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void FASTCALL(unlock_buffer(struct buffer_head *bh));
+void FASTCALL(__lock_buffer(struct buffer_head *bh));
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 void sync_dirty_buffer(struct buffer_head *bh);
 void submit_bh(int, struct buffer_head *);
@@ -279,8 +280,8 @@ static inline void wait_on_buffer(struct
 
 static inline void lock_buffer(struct buffer_head *bh)
 {
-	while (test_set_buffer_locked(bh))
-		__wait_on_buffer(bh);
+	if (test_set_buffer_locked(bh))
+		__lock_buffer(bh);
 }
 
 #endif /* _LINUX_BUFFER_HEAD_H */
diff -puN include/linux/list.h~wake-one-pg_locked-bh_lock-semantics include/linux/list.h
--- 25/include/linux/list.h~wake-one-pg_locked-bh_lock-semantics	2004-05-04 23:56:33.608541520 -0700
+++ 25-akpm/include/linux/list.h	2004-05-04 23:56:33.617540152 -0700
@@ -413,6 +413,19 @@ static inline void list_splice_init(stru
 		pos = n, n = list_entry(n->member.next, typeof(*n), member))
 
 /**
+ * list_for_each_entry_reverse_safe - iterate over list of given type safe against removal of list entry backward
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head);					\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/**
  * list_for_each_rcu - iterate over an rcu-protected list
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
diff -puN kernel/sched.c~wake-one-pg_locked-bh_lock-semantics kernel/sched.c
--- 25/kernel/sched.c~wake-one-pg_locked-bh_lock-semantics	2004-05-04 23:56:33.610541216 -0700
+++ 25-akpm/kernel/sched.c	2004-05-04 23:56:33.620539696 -0700
@@ -2525,9 +2525,14 @@ void fastcall wake_up_filtered(wait_queu
 	struct filtered_wait_queue *wait, *save;
 
 	spin_lock_irqsave(&q->lock, flags);
-	list_for_each_entry_safe(wait, save, &q->task_list, wait.task_list) {
-		if (wait->key == key)
-			wait->wait.func(&wait->wait, mode, 0);
+	list_for_each_entry_reverse_safe(wait, save, &q->task_list, wait.task_list) {
+		int exclusive = wait->wait.flags & WQ_FLAG_EXCLUSIVE;
+		if (wait->key != key)
+			continue;
+		else if (!wait->wait.func(&wait->wait, mode, 0))
+			continue;
+		else if (exclusive)
+			break;
 	}
 	spin_unlock_irqrestore(&q->lock, flags);
 }
diff -puN mm/filemap.c~wake-one-pg_locked-bh_lock-semantics mm/filemap.c
--- 25/mm/filemap.c~wake-one-pg_locked-bh_lock-semantics	2004-05-04 23:56:33.612540912 -0700
+++ 25-akpm/mm/filemap.c	2004-05-04 23:56:33.623539240 -0700
@@ -293,17 +293,24 @@ int add_to_page_cache_lru(struct page *p
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *page_waitqueue(struct page *page, int bit)
 {
 	const struct zone *zone = page_zone(page);
+	unsigned long key = (unsigned long)page + bit;
 
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
+	return &zone->wait_table[hash_long(key, zone->wait_table_bits)];
+}
+
+#define PAGE_KEY_SHIFT (BITS_PER_LONG - (BITS_PER_LONG == 32 ? 5 : 6))
+static void *page_key(struct page *page, unsigned long bit)
+{
+	return (void *)(page_to_pfn(page) | bit << PAGE_KEY_SHIFT);
 }
 
 void fastcall wait_on_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
-	DEFINE_FILTERED_WAIT(wait, page);
+	wait_queue_head_t *waitqueue = page_waitqueue(page, bit_nr);
+	DEFINE_FILTERED_WAIT(wait, page_key(page, bit_nr));
 
 	do {
 		prepare_to_wait(waitqueue, &wait.wait, TASK_UNINTERRUPTIBLE);
@@ -334,13 +341,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  */
 void fastcall unlock_page(struct page *page)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	wait_queue_head_t *waitqueue = page_waitqueue(page, PG_locked);
+
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
 		BUG();
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(waitqueue))
-		wake_up_filtered(waitqueue, page);
+		wake_up_filtered(waitqueue, page_key(page, PG_locked));
 }
 EXPORT_SYMBOL(unlock_page);
 
@@ -351,7 +359,7 @@ EXPORT_SYMBOL(lock_page);
  */
 void end_page_writeback(struct page *page)
 {
-	wait_queue_head_t *waitqueue = page_waitqueue(page);
+	wait_queue_head_t *waitqueue = page_waitqueue(page, PG_writeback);
 
 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 		if (!test_clear_page_writeback(page))
@@ -359,7 +367,7 @@ void end_page_writeback(struct page *pag
 		smp_mb__after_clear_bit();
 	}
 	if (waitqueue_active(waitqueue))
-		wake_up_filtered(waitqueue, page);
+		wake_up_filtered(waitqueue, page_key(page, PG_writeback));
 }
 EXPORT_SYMBOL(end_page_writeback);
 
@@ -374,11 +382,11 @@ EXPORT_SYMBOL(end_page_writeback);
  */
 void fastcall __lock_page(struct page *page)
 {
-	wait_queue_head_t *wqh = page_waitqueue(page);
-	DEFINE_FILTERED_WAIT(wait, page);
+	wait_queue_head_t *wqh = page_waitqueue(page, PG_locked);
+	DEFINE_FILTERED_WAIT(wait, page_key(page, PG_locked));
 
 	while (TestSetPageLocked(page)) {
-		prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait_exclusive(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
 		if (PageLocked(page)) {
 			sync_page(page);
 			io_schedule();
_
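
As a sanity check of the traversal order, here is a minimal userspace
sketch of the new macro. The toy list_head, list_add()/list_del(),
list_entry(), struct waiter, and the demo values are invented purely for
illustration; only the macro body mirrors the list.h addition above.

#include <stdio.h>
#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Insert at the head, so the tail ends up holding the oldest entry. */
static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = NULL;	/* toy stand-in for pointer poisoning */
}

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Same shape as the macro added to include/linux/list.h above. */
#define list_for_each_entry_reverse_safe(pos, n, head, member)		\
	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
		n = list_entry(pos->member.prev, typeof(*pos), member);	\
	     &pos->member != (head);					\
	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))

struct waiter {
	int id;
	struct list_head list;
};

int main(void)
{
	struct list_head q = LIST_HEAD_INIT(q);
	struct waiter w[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	struct waiter *pos, *n;
	int i;

	for (i = 0; i < 3; i++)
		list_add(&w[i].list, &q);

	/* Tail-to-head walk; deleting the current entry is safe. */
	list_for_each_entry_reverse_safe(pos, n, &q, list) {
		printf("waking waiter %d\n", pos->id);
		list_del(&pos->list);	/* as an autoremove wakeup function would */
	}
	return 0;
}

Built with gcc (the macro relies on the typeof extension), this prints
0, 1, 2: the oldest entry, sitting at the tail of a head-inserted list,
is visited first, and unlinking the current entry does not break the
walk.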
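
Likewise, the page_key() packing is easier to see outside the kernel.
The following rough userspace sketch assumes the pfn never reaches the
top PAGE_KEY_SHIFT bits, uses a made-up pfn, and derives BITS_PER_LONG
from sizeof(long) rather than the kernel headers:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG	(sizeof(long) * CHAR_BIT)
#define PAGE_KEY_SHIFT	(BITS_PER_LONG - (BITS_PER_LONG == 32 ? 5 : 6))

/*
 * The page frame number occupies the low bits and the page-flag bit
 * number is packed into the top 5 (32-bit) or 6 (64-bit) bits, so
 * waiters on different bits of the same page get distinct keys and can
 * hash to distinct waitqueues.
 */
static void *page_key(unsigned long pfn, unsigned long bit)
{
	return (void *)(pfn | bit << PAGE_KEY_SHIFT);
}

int main(void)
{
	unsigned long pfn = 0x12345;	/* made-up page frame number */

	/* Different flag bits on the same page yield different keys. */
	printf("bit 0: %p\n", page_key(pfn, 0));
	printf("bit 1: %p\n", page_key(pfn, 1));
	return 0;
}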