diff -urNp x-ref/drivers/block/ll_rw_blk.c x/drivers/block/ll_rw_blk.c
--- x-ref/drivers/block/ll_rw_blk.c	Tue Nov 19 19:45:56 2002
+++ x/drivers/block/ll_rw_blk.c	Tue Nov 19 19:46:19 2002
@@ -596,12 +596,20 @@ static struct request *__get_request_wai
 	register struct request *rq;
 	DECLARE_WAITQUEUE(wait, current);
 
-	generic_unplug_device(q);
 	add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
 	do {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (q->rq[rw].count == 0)
+		if (q->rq[rw].count == 0) {
+			/*
+			 * All we care about is not to stall if any request
+			 * has been released after we set TASK_UNINTERRUPTIBLE.
+			 * This is the most efficient place to unplug the queue
+			 * in case we hit the race and we can get the request
+			 * without waiting.
+			 */
+			generic_unplug_device(q);
 			schedule();
+		}
 		spin_lock_irq(q->queue_lock);
 		rq = get_request(q, rw);
 		spin_unlock_irq(q->queue_lock);
@@ -611,6 +619,17 @@ static struct request *__get_request_wai
 	return rq;
 }
 
+static void get_request_wait_wakeup(request_queue_t *q, int rw)
+{
+	/*
+	 * Avoid losing an unplug if a second __get_request_wait did the
+	 * generic_unplug_device while our __get_request_wait was running
+	 * w/o the queue_lock held and w/ our request out of the queue.
+	 */
+	if (waitqueue_active(&q->wait_for_requests[rw]))
+		wake_up(&q->wait_for_requests[rw]);
+}
+
 /* RO fail safe mechanism */
 
 static long ro_bits[MAX_BLKDEV][8];
@@ -835,9 +854,11 @@ void blkdev_release_request(struct reque
 	 */
 	if (q) {
 		list_add(&req->queue, &q->rq[rw].free);
-		if (++q->rq[rw].count >= q->batch_requests &&
-		    waitqueue_active(&q->wait_for_requests[rw]))
-			wake_up(&q->wait_for_requests[rw]);
+		if (++q->rq[rw].count >= q->batch_requests) {
+			smp_mb();
+			if (waitqueue_active(&q->wait_for_requests[rw]))
+				wake_up(&q->wait_for_requests[rw]);
+		}
 	}
 }
 
@@ -955,7 +976,6 @@ static int __make_request(request_queue_
 	 */
 	max_sectors = get_max_sectors(bh->b_rdev);
 
-again:
 	req = NULL;
 	head = &q->queue_head;
 	/*
@@ -964,6 +984,7 @@ again:
 	 */
 	spin_lock_irq(q->queue_lock);
 
+again:
 	insert_here = head->prev;
 	if (list_empty(head)) {
 		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
@@ -1049,6 +1070,9 @@ get_rq:
 		if (req == NULL) {
 			spin_unlock_irq(q->queue_lock);
 			freereq = __get_request_wait(q, rw);
+			head = &q->queue_head;
+			spin_lock_irq(q->queue_lock);
+			get_request_wait_wakeup(q, rw);
 			goto again;
 		}
 	}
@@ -1206,6 +1230,11 @@ void __submit_bh(int rw, struct buffer_h
 
 	generic_make_request(rw, bh);
 
+	/* fix race condition with wait_on_buffer() */
+	smp_mb(); /* spin_unlock may have inclusive semantics */
+	if (waitqueue_active(&bh->b_wait))
+		wake_up(&bh->b_wait);
+
 	switch (rw) {
 		case WRITE:
 			kstat.pgpgout += count;
diff -urNp x-ref/fs/buffer.c x/fs/buffer.c
--- x-ref/fs/buffer.c	Tue Nov 19 19:45:56 2002
+++ x/fs/buffer.c	Tue Nov 19 19:46:19 2002
@@ -158,10 +158,23 @@ void __wait_on_buffer(struct buffer_head
 	get_bh(bh);
 	add_wait_queue(&bh->b_wait, &wait);
 	do {
-		run_task_queue(&tq_disk);
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!buffer_locked(bh))
 			break;
+		/*
+		 * We must read tq_disk in TQ_ACTIVE after the
+		 * add_wait_queue effect is visible to other cpus.
+		 * We could unplug a few lines above; it wouldn't matter,
+		 * but we can't do that right after add_wait_queue
+		 * without an smp_mb() in between, because spin_unlock
+		 * has inclusive semantics.
+		 * Doing it here is the most efficient place, so we
+		 * don't do a spurious unplug if we get a racy
+		 * wakeup that makes buffer_locked return 0, and
+		 * doing it here avoids an explicit smp_mb(): we
+		 * rely on the implicit one in set_task_state.
+		 */
+		run_task_queue(&tq_disk);
 		schedule();
 	} while (buffer_locked(bh));
 	tsk->state = TASK_RUNNING;
@@ -1531,6 +1544,9 @@ static int __block_write_full_page(struc
 
 	/* Done - end_buffer_io_async will unlock */
 	SetPageUptodate(page);
+
+	wakeup_page_waiters(page);
+
 	return 0;
 
 out:
@@ -1562,6 +1578,7 @@ out:
 	} while (bh != head);
 	if (need_unlock)
 		UnlockPage(page);
+	wakeup_page_waiters(page);
 	return err;
 }
 
@@ -1796,6 +1813,8 @@ int block_read_full_page(struct page *pa
 		else
 			submit_bh(READ, bh);
 	}
+
+	wakeup_page_waiters(page);
 	return 0;
 }
 
@@ -2424,6 +2443,7 @@ int brw_page(int rw, struct page *page, 
 		submit_bh(rw, bh);
 		bh = next;
 	} while (bh != head);
+	wakeup_page_waiters(page);
 	return 0;
 }
 
diff -urNp x-ref/fs/reiserfs/inode.c x/fs/reiserfs/inode.c
--- x-ref/fs/reiserfs/inode.c	Tue Nov 19 19:45:46 2002
+++ x/fs/reiserfs/inode.c	Tue Nov 19 19:46:19 2002
@@ -1999,6 +1999,7 @@ static int reiserfs_write_full_page(stru
      */
     if (nr) {
         submit_bh_for_writepage(arr, nr) ;
+        wakeup_page_waiters(page);
     } else {
         UnlockPage(page) ;
     }
diff -urNp x-ref/include/linux/pagemap.h x/include/linux/pagemap.h
--- x-ref/include/linux/pagemap.h	Tue Nov 19 19:45:56 2002
+++ x/include/linux/pagemap.h	Tue Nov 19 19:46:19 2002
@@ -98,6 +98,8 @@ static inline void wait_on_page(struct p
 		___wait_on_page(page);
 }
 
+extern void FASTCALL(wakeup_page_waiters(struct page * page));
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */
diff -urNp x-ref/kernel/ksyms.c x/kernel/ksyms.c
--- x-ref/kernel/ksyms.c	Tue Nov 19 19:45:56 2002
+++ x/kernel/ksyms.c	Tue Nov 19 19:46:25 2002
@@ -315,6 +315,7 @@ EXPORT_SYMBOL(filemap_fdatasync);
 EXPORT_SYMBOL(filemap_fdatawait);
 EXPORT_SYMBOL(lock_page);
 EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(wakeup_page_waiters);
 
 /* device registration */
 EXPORT_SYMBOL(register_chrdev);
diff -urNp x-ref/mm/filemap.c x/mm/filemap.c
--- x-ref/mm/filemap.c	Tue Nov 19 19:45:56 2002
+++ x/mm/filemap.c	Tue Nov 19 19:46:19 2002
@@ -771,6 +771,20 @@ inline wait_queue_head_t * page_waitqueu
 	return wait_table_hashfn(page, &pgdat->wait_table);
 }
 
+/*
+ * This must be called after every submit_bh with end_io
+ * callbacks that would result in the blkdev layer waking
+ * up the page after a queue unplug.
+ */
+void wakeup_page_waiters(struct page * page)
+{
+	wait_queue_head_t * head;
+
+	head = page_waitqueue(page);
+	if (waitqueue_active(head))
+		wake_up(head);
+}
+
 /*
  * Wait for a page to get unlocked.
  *
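For reference, the sleeper/waker ordering the patch depends on (the smp_mb() added in blkdev_release_request()/__submit_bh(), and the barrier implied by set_task_state() that the __wait_on_buffer comment relies on) can be sketched in user space with C11 atomics and pthreads. This is only an illustrative analogue, not kernel code and not part of the patch: "condition" and "waiters" below are hypothetical stand-ins for the freed-request/buffer-unlocked state and for waitqueue_active(). Build with e.g. gcc -pthread.

/*
 * User-space sketch of the lost-wakeup pattern: the waker must publish
 * the condition and issue a full barrier *before* testing for waiters,
 * and the sleeper must register itself and issue a full barrier *before*
 * re-testing the condition.  If either barrier is missing, both sides
 * can miss each other and the sleeper stalls until something else
 * unplugs the queue -- the hang this patch fixes.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int condition;	/* e.g. "request freed" / "buffer unlocked" */
static atomic_int waiters;	/* rough analogue of waitqueue_active() */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;

static void *sleeper(void *arg)
{
	(void)arg;
	/* register on the "wait queue" first ... */
	atomic_store(&waiters, 1);
	/* ... full barrier, then re-check the condition before sleeping
	 * (the kernel gets this barrier implicitly from set_task_state). */
	atomic_thread_fence(memory_order_seq_cst);
	pthread_mutex_lock(&lock);
	while (!atomic_load(&condition))
		pthread_cond_wait(&wq, &lock);
	pthread_mutex_unlock(&lock);
	puts("sleeper: condition observed, no lost wakeup");
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	/* publish the condition first ... */
	atomic_store(&condition, 1);
	/* ... full barrier so the store is visible before we test for
	 * waiters (this is what the added smp_mb() guarantees; a plain
	 * spin_unlock is only a one-way barrier). */
	atomic_thread_fence(memory_order_seq_cst);
	/* only now is it safe to skip the wakeup when nobody is waiting */
	if (atomic_load(&waiters)) {
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&wq);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, sleeper, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}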