From: Nick Piggin

This patch includes Chris Mason's fix to only clear queue_full when all tasks
have been woken.  Previously I think starvation and unfairness could still
occur.

With this change to the blk-fair-batches patch, Chris is showing much improved
numbers for 2.4 - 170ms max wait vs 2700ms without blk-fair-batches for a
dbench 90 run.  He didn't indicate how much difference his patch alone made,
but it is an important fix I think.

(A toy sketch of this wake/clear ordering is appended after the diff.)


 drivers/block/ll_rw_blk.c |   54 ++++++++++++++++++++--------------------------
 1 files changed, 24 insertions(+), 30 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~blk-fair-batches-2 drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~blk-fair-batches-2	2003-06-11 22:36:24.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2003-06-11 22:36:24.000000000 -0700
@@ -1319,7 +1319,8 @@ static inline struct request *blk_alloc_
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *
+get_request(request_queue_t *q, int rw, int gfp_mask, int force)
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
@@ -1328,7 +1329,7 @@ static struct request *get_request(reque
 	if (rl->count[rw] == q->nr_requests)
 		blk_set_queue_full(q, rw);
 
-	if (blk_queue_full(q, rw) && !elv_may_queue(q, rw)) {
+	if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) {
 		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
@@ -1345,11 +1346,10 @@ static struct request *get_request(reque
 			clear_queue_congested(q, rw);
 
 		if (rl->count[rw] <= batch_requests(q)) {
-			if (rl->count[rw] == batch_requests(q))
-				blk_clear_queue_full(q, rw);
-
 			if (waitqueue_active(&rl->wait[rw]))
 				wake_up(&rl->wait[rw]);
+			else
+				blk_clear_queue_full(q, rw);
 		}
 
 		spin_unlock_irq(q->queue_lock);
@@ -1388,26 +1388,22 @@ static struct request *get_request_wait(
 {
 	DEFINE_WAIT(wait);
 	struct request *rq;
+	int waited = 0;
 
 	generic_unplug_device(q);
 	do {
-		rq = get_request(q, rw, GFP_NOIO);
+		struct request_list *rl = &q->rq;
 
-		if (!rq) {
-			struct request_list *rl = &q->rq;
+		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		rq = get_request(q, rw, GFP_NOIO, waited);
 
-			prepare_to_wait_exclusive(&rl->wait[rw], &wait,
-					TASK_UNINTERRUPTIBLE);
-			/*
-			 * If _all_ the requests were suddenly returned then
-			 * no wakeup will be delivered. So now we're on the
-			 * waitqueue, go check for that.
-			 */
-			rq = get_request(q, rw, GFP_NOIO);
-			if (!rq)
-				io_schedule();
-			finish_wait(&rl->wait[rw], &wait);
+		if (!rq) {
+			io_schedule();
+			waited = 1;
 		}
+		finish_wait(&rl->wait[rw], &wait);
 	} while (!rq);
 
 	return rq;
@@ -1419,10 +1415,10 @@ struct request *blk_get_request(request_
 
 	BUG_ON(rw != READ && rw != WRITE);
 
-	rq = get_request(q, rw, gfp_mask);
-
-	if (!rq && (gfp_mask & __GFP_WAIT))
+	if (gfp_mask & __GFP_WAIT)
 		rq = get_request_wait(q, rw);
+	else
+		rq = get_request(q, rw, gfp_mask, 0);
 
 	return rq;
 }
@@ -1575,11 +1571,10 @@ void __blk_put_request(request_queue_t *
 			clear_queue_congested(q, rw);
 
 		if (rl->count[rw] <= batch_requests(q)) {
-			if (rl->count[rw] == batch_requests(q))
-				blk_clear_queue_full(q, rw);
-
 			if (waitqueue_active(&rl->wait[rw]))
 				wake_up(&rl->wait[rw]);
+			else
+				blk_clear_queue_full(q, rw);
 		}
 	}
 }
@@ -1823,7 +1818,7 @@ get_rq:
 		freereq = NULL;
 	} else {
 		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+		if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) {
 			/*
 			 * READA bit set
 			 */
@@ -1931,8 +1926,7 @@ static inline void blk_partition_remap(s
  * bio happens to be merged with someone else, and may change bi_dev and
  * bi_sector for remaps as it sees fit. So the values of these fields
  * should NOT be depended on after the call to generic_make_request.
- *
- * */
+ */
 void generic_make_request(struct bio *bio)
 {
 	request_queue_t *q;
@@ -2446,14 +2440,14 @@ queue_requests_store(struct request_queu
 		blk_set_queue_full(q, READ);
 	else if (rl->count[READ] <= batch_requests(q)) {
 		blk_clear_queue_full(q, READ);
-		wake_up(&rl->wait[READ]);
+		wake_up_all(&rl->wait[READ]);
 	}
 
 	if (rl->count[WRITE] >= q->nr_requests)
 		blk_set_queue_full(q, WRITE);
 	else if (rl->count[WRITE] <= batch_requests(q)) {
 		blk_clear_queue_full(q, WRITE);
-		wake_up(&rl->wait[WRITE]);
+		wake_up_all(&rl->wait[WRITE]);
 	}
 
 	return ret;
_
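
For illustration only, not part of the patch: the following is a minimal
user-space C sketch of the wake/clear rule described above, using an invented
struct pool and pool_release() rather than the real request_list/queue_full
machinery.  While waiters remain, each freed request wakes exactly one of them
and the queue stays marked full; the flag is cleared only when a request is
freed and nobody is left waiting.

/*
 * Illustration only -- NOT the kernel code.  Toy model of the rule the fix
 * enforces in get_request()/__blk_put_request(): wake one waiter per freed
 * request while anyone is waiting, and clear "full" only once the waitqueue
 * is empty.  All names below (pool, pool_release, ...) are made up.
 */
#include <stdio.h>
#include <stdbool.h>

struct pool {
	int count;	/* requests currently allocated (like rl->count[rw]) */
	int batch;	/* wakeup threshold (like batch_requests(q)) */
	int waiters;	/* tasks sleeping for a request */
	bool full;	/* stands in for the queue_full flag */
};

/* Free one request, applying the wake-or-clear ordering. */
static void pool_release(struct pool *p)
{
	p->count--;
	if (p->count > p->batch)
		return;

	if (p->waiters > 0) {
		/* like wake_up(): hand the slot to one waiter, stay "full" */
		p->waiters--;
		printf("freed: woke one waiter, %d still queued, full=%d\n",
		       p->waiters, (int)p->full);
	} else {
		/* only now may new tasks allocate without queueing behind others */
		p->full = false;
		printf("freed: no waiters left, cleared full\n");
	}
}

int main(void)
{
	struct pool p = { .count = 8, .batch = 4, .waiters = 3, .full = true };

	/* drain below the batch threshold; sleepers are served first */
	for (int i = 0; i < 6; i++)
		pool_release(&p);
	return 0;
}

Compare with the removed lines above, which cleared queue_full as soon as
count reached batch_requests(q), so newly arriving tasks could allocate ahead
of tasks still asleep on the waitqueue.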