From: Nick Piggin

This patch includes Chris Mason's fix to only clear queue_full when all
tasks have been woken.  Previously, I think, starvation and unfairness
could still occur.

With this change to the blk-fair-batches patch, Chris is seeing much
improved numbers on 2.4: a 170ms max wait for a dbench 90 run, versus
2700ms without blk-fair-batches.  He didn't indicate how much difference
his fix alone made, but I think it is an important one.


 drivers/block/ll_rw_blk.c |   54 ++++++++++++++++++++--------------------------
 1 files changed, 24 insertions(+), 30 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~blk-fair-batches-2 drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~blk-fair-batches-2    2003-06-18 00:17:19.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c   2003-06-18 00:17:19.000000000 -0700
@@ -1313,7 +1313,8 @@ static inline struct request *blk_alloc_
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *
+get_request(request_queue_t *q, int rw, int gfp_mask, int force)
 {
         struct request *rq = NULL;
         struct request_list *rl = &q->rq;
@@ -1321,7 +1322,7 @@ static struct request *get_request(reque
         spin_lock_irq(q->queue_lock);
         if (rl->count[rw] == q->nr_requests)
                 blk_set_queue_full(q, rw);
-        if (blk_queue_full(q, rw) && !elv_may_queue(q, rw)) {
+        if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) {
                 spin_unlock_irq(q->queue_lock);
                 goto out;
         }
@@ -1338,11 +1339,10 @@ static struct request *get_request(reque
                 clear_queue_congested(q, rw);

         if (rl->count[rw] <= batch_requests(q)) {
-                if (rl->count[rw] == batch_requests(q))
-                        blk_clear_queue_full(q, rw);
-
                 if (waitqueue_active(&rl->wait[rw]))
                         wake_up(&rl->wait[rw]);
+                else
+                        blk_clear_queue_full(q, rw);
         }

         spin_unlock_irq(q->queue_lock);
@@ -1381,26 +1381,22 @@ static struct request *get_request_wait(
 {
         DEFINE_WAIT(wait);
         struct request *rq;
+        int waited = 0;

         generic_unplug_device(q);
         do {
-                rq = get_request(q, rw, GFP_NOIO);
+                struct request_list *rl = &q->rq;

-                if (!rq) {
-                        struct request_list *rl = &q->rq;
+                prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+                                TASK_UNINTERRUPTIBLE);
+
+                rq = get_request(q, rw, GFP_NOIO, waited);

-                        prepare_to_wait_exclusive(&rl->wait[rw], &wait,
-                                        TASK_UNINTERRUPTIBLE);
-                        /*
-                         * If _all_ the requests were suddenly returned then
-                         * no wakeup will be delivered. So now we're on the
-                         * waitqueue, go check for that.
-                         */
-                        rq = get_request(q, rw, GFP_NOIO);
-                        if (!rq)
-                                io_schedule();
-                        finish_wait(&rl->wait[rw], &wait);
+                if (!rq) {
+                        io_schedule();
+                        waited = 1;
                 }
+                finish_wait(&rl->wait[rw], &wait);
         } while (!rq);

         return rq;
@@ -1412,10 +1408,10 @@ struct request *blk_get_request(request_

         BUG_ON(rw != READ && rw != WRITE);

-        rq = get_request(q, rw, gfp_mask);
-
-        if (!rq && (gfp_mask & __GFP_WAIT))
+        if (gfp_mask & __GFP_WAIT)
                 rq = get_request_wait(q, rw);
+        else
+                rq = get_request(q, rw, gfp_mask, 0);

         return rq;
 }
@@ -1568,11 +1564,10 @@ void __blk_put_request(request_queue_t *
                         clear_queue_congested(q, rw);

                 if (rl->count[rw] <= batch_requests(q)) {
-                        if (rl->count[rw] == batch_requests(q))
-                                blk_clear_queue_full(q, rw);
-
                         if (waitqueue_active(&rl->wait[rw]))
                                 wake_up(&rl->wait[rw]);
+                        else
+                                blk_clear_queue_full(q, rw);
                 }
         }
 }
@@ -1816,7 +1811,7 @@ get_rq:
                 freereq = NULL;
         } else {
                 spin_unlock_irq(q->queue_lock);
-                if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+                if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) {
                         /*
                          * READA bit set
                          */
@@ -1924,8 +1919,7 @@ static inline void blk_partition_remap(s
  * bio happens to be merged with someone else, and may change bi_dev and
  * bi_sector for remaps as it sees fit.  So the values of these fields
  * should NOT be depended on after the call to generic_make_request.
- *
- * */
+ */
 void generic_make_request(struct bio *bio)
 {
         request_queue_t *q;
@@ -2439,14 +2433,14 @@ queue_requests_store(struct request_queu
                 blk_set_queue_full(q, READ);
         else if (rl->count[READ] <= batch_requests(q)) {
                 blk_clear_queue_full(q, READ);
-                wake_up(&rl->wait[READ]);
+                wake_up_all(&rl->wait[READ]);
         }

         if (rl->count[WRITE] >= q->nr_requests)
                 blk_set_queue_full(q, WRITE);
         else if (rl->count[WRITE] <= batch_requests(q)) {
                 blk_clear_queue_full(q, WRITE);
-                wake_up(&rl->wait[WRITE]);
+                wake_up_all(&rl->wait[WRITE]);
         }

         return ret;
_
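
For anyone who wants to poke at the batching logic outside the kernel, below
is a rough userspace model of what the patch does.  It is only a sketch:
req_pool, pool_get(), pool_put(), NR_REQUESTS and BATCH_REQUESTS are made-up
stand-ins for the request list, get_request()/get_request_wait(),
__blk_put_request(), q->nr_requests and batch_requests(q), and a pthread
mutex, condvar and waiter count stand in for queue_lock and the waitqueue.
The point it illustrates is Chris's fix: on free, keep waking sleepers while
any remain on the waitqueue and only clear the full flag once the waitqueue
is empty, while a task that has already slept once is let past the full flag,
like the waited/force argument above.

/* fair_batch_sketch.c - userspace model of the blk-fair-batches logic.
 * Build: cc -pthread fair_batch_sketch.c -o fair_batch_sketch
 * All names and constants here are invented for illustration.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_REQUESTS     32      /* models q->nr_requests (small on purpose) */
#define BATCH_REQUESTS   8      /* models batch_requests(q) */

struct req_pool {
        pthread_mutex_t lock;   /* models q->queue_lock */
        pthread_cond_t wait;    /* models rl->wait[rw] */
        int count;              /* models rl->count[rw] */
        int waiters;            /* stands in for waitqueue_active() */
        bool full;              /* models the queue_full flag */
};

static struct req_pool pool = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .wait = PTHREAD_COND_INITIALIZER,
};

/* Allocate one slot.  A task that has already slept once is allowed past
 * the full flag, mirroring the waited/force argument in the patch. */
static void pool_get(struct req_pool *p)
{
        bool waited = false;

        pthread_mutex_lock(&p->lock);
        for (;;) {
                if (p->count == NR_REQUESTS)
                        p->full = true;
                if (p->count < NR_REQUESTS && (!p->full || waited))
                        break;
                /* Pool full: sleep like io_schedule() and remember it. */
                p->waiters++;
                pthread_cond_wait(&p->wait, &p->lock);
                p->waiters--;
                waited = true;
        }
        p->count++;
        pthread_mutex_unlock(&p->lock);
}

/* Free one slot.  Once the pool drains back to the batch threshold, wake
 * sleepers one at a time; clear the full flag only when nobody is left
 * waiting, so late arrivals cannot starve tasks already queued. */
static void pool_put(struct req_pool *p)
{
        pthread_mutex_lock(&p->lock);
        p->count--;
        if (p->count <= BATCH_REQUESTS) {
                if (p->waiters)
                        pthread_cond_signal(&p->wait);
                else
                        p->full = false;
        }
        pthread_mutex_unlock(&p->lock);
}

static void *worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++) {
                pool_get(&pool);
                pool_put(&pool);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[90];        /* a dbench-90-ish crowd of allocators */

        for (int i = 0; i < 90; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 90; i++)
                pthread_join(t[i], NULL);
        printf("done: count=%d waiters=%d full=%d\n",
               pool.count, pool.waiters, (int)pool.full);
        return 0;
}

Waking one task at a time (pthread_cond_signal) mirrors the exclusive waiters
in get_request_wait(); the sysfs resize path in the patch switches to
wake_up_all() instead because nr_requests itself has just changed and every
sleeper needs to re-evaluate.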