From: Nick Piggin

This patch fixes the request batching fairness/starvation issue.  It's not
clear what is going on with 2.4, but it seems there is a problem around
this area.

Anyway, previously:

* request queue fills up
* process 1 calls get_request, sleeps
* a couple of requests are freed
* process 2 calls get_request, proceeds
* a couple of requests are freed
* process 2 calls get_request...

Now, as unlikely as it seems, it could be a problem.  It's a fairness
problem that process 2 can skip ahead of process 1 anyway.

With the patch:

* request queue fills up
* any process calling get_request will sleep
* once the queue gets below the batch watermark, processes start being
  woken, and may allocate.

DESC
blk fair batches #2
EDESC
From: Nick Piggin

This patch includes Chris Mason's fix to only clear queue_full when all
tasks have been woken.  Previously I think starvation and unfairness could
still occur.

With this change to the blk-fair-batches patch, Chris is showing much
improved numbers for 2.4 - 170ms max wait vs 2700ms without
blk-fair-batches for a dbench 90 run.  He didn't indicate how much
difference his patch alone made, but it is an important fix, I think.


 drivers/block/ll_rw_blk.c |   75 ++++++++++++++++++++++++++++++----------------
 include/linux/blkdev.h    |   26 +++++++++++++++
 2 files changed, 75 insertions(+), 26 deletions(-)

diff -puN drivers/block/ll_rw_blk.c~blk-fair-batches drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~blk-fair-batches	2003-07-02 22:10:50.000000000 -0700
+++ 25-akpm/drivers/block/ll_rw_blk.c	2003-07-02 22:10:56.000000000 -0700
@@ -53,7 +53,7 @@ unsigned long blk_max_low_pfn, blk_max_p
 
 static inline int batch_requests(struct request_queue *q)
 {
-	return q->nr_requests - min(q->nr_requests / 8, 8UL);
+	return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1;
 }
 
 /*
@@ -1323,13 +1323,16 @@ static inline struct request *blk_alloc_
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *
+get_request(request_queue_t *q, int rw, int gfp_mask, int force)
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
 
 	spin_lock_irq(q->queue_lock);
-	if (rl->count[rw] >= q->nr_requests && !elv_may_queue(q, rw)) {
+	if (rl->count[rw] == q->nr_requests)
+		blk_set_queue_full(q, rw);
+	if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) {
 		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
@@ -1344,6 +1347,14 @@ static struct request *get_request(reque
 		rl->count[rw]--;
 		if (rl->count[rw] < queue_congestion_off_threshold(q))
 			clear_queue_congested(q, rw);
+
+		if (rl->count[rw] <= batch_requests(q)) {
+			if (waitqueue_active(&rl->wait[rw]))
+				wake_up(&rl->wait[rw]);
+			else
+				blk_clear_queue_full(q, rw);
+		}
+
 		spin_unlock_irq(q->queue_lock);
 		goto out;
 	}
@@ -1380,26 +1391,22 @@ static struct request *get_request_wait(
 {
 	DEFINE_WAIT(wait);
 	struct request *rq;
+	int waited = 0;
 
 	generic_unplug_device(q);
 	do {
-		rq = get_request(q, rw, GFP_NOIO);
+		struct request_list *rl = &q->rq;
 
-		if (!rq) {
-			struct request_list *rl = &q->rq;
+		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+				TASK_UNINTERRUPTIBLE);
 
-			prepare_to_wait_exclusive(&rl->wait[rw], &wait,
-						TASK_UNINTERRUPTIBLE);
-			/*
-			 * If _all_ the requests were suddenly returned then
-			 * no wakeup will be delivered.  So now we're on the
-			 * waitqueue, go check for that.
-			 */
-			rq = get_request(q, rw, GFP_NOIO);
-			if (!rq)
-				io_schedule();
-			finish_wait(&rl->wait[rw], &wait);
+		rq = get_request(q, rw, GFP_NOIO, waited);
+
+		if (!rq) {
+			io_schedule();
+			waited = 1;
 		}
+		finish_wait(&rl->wait[rw], &wait);
 	} while (!rq);
 
 	return rq;
@@ -1411,10 +1418,10 @@ struct request *blk_get_request(request_
 
 	BUG_ON(rw != READ && rw != WRITE);
 
-	rq = get_request(q, rw, gfp_mask);
-
-	if (!rq && (gfp_mask & __GFP_WAIT))
+	if (gfp_mask & __GFP_WAIT)
 		rq = get_request_wait(q, rw);
+	else
+		rq = get_request(q, rw, gfp_mask, 0);
 
 	return rq;
 }
@@ -1565,9 +1572,13 @@ void __blk_put_request(request_queue_t *
 		rl->count[rw]--;
 		if (rl->count[rw] < queue_congestion_off_threshold(q))
 			clear_queue_congested(q, rw);
-		if (rl->count[rw] < batch_requests(q) &&
-				waitqueue_active(&rl->wait[rw]))
-			wake_up(&rl->wait[rw]);
+
+		if (rl->count[rw] <= batch_requests(q)) {
+			if (waitqueue_active(&rl->wait[rw]))
+				wake_up(&rl->wait[rw]);
+			else
+				blk_clear_queue_full(q, rw);
+		}
 	}
 }
 
@@ -1810,7 +1821,7 @@ get_rq:
 		freereq = NULL;
 	} else {
 		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+		if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) {
 			/*
 			 * READA bit set
 			 */
@@ -1918,8 +1929,7 @@ static inline void blk_partition_remap(s
  * bio happens to be merged with someone else, and may change bi_dev and
  * bi_sector for remaps as it sees fit.  So the values of these fields
  * should NOT be depended on after the call to generic_make_request.
- *
- * */
+ */
 void generic_make_request(struct bio *bio)
 {
 	request_queue_t *q;
@@ -2429,6 +2439,19 @@ queue_requests_store(struct request_queu
 	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
 		clear_queue_congested(q, WRITE);
 
+	if (rl->count[READ] >= q->nr_requests) {
+		blk_set_queue_full(q, READ);
+	} else if (rl->count[READ] <= batch_requests(q)) {
+		blk_clear_queue_full(q, READ);
+		wake_up_all(&rl->wait[READ]);
+	}
+
+	if (rl->count[WRITE] >= q->nr_requests) {
+		blk_set_queue_full(q, WRITE);
+	} else if (rl->count[WRITE] <= batch_requests(q)) {
+		blk_clear_queue_full(q, WRITE);
+		wake_up_all(&rl->wait[WRITE]);
+	}
 	return ret;
 }
 
diff -puN include/linux/blkdev.h~blk-fair-batches include/linux/blkdev.h
--- 25/include/linux/blkdev.h~blk-fair-batches	2003-07-02 22:10:50.000000000 -0700
+++ 25-akpm/include/linux/blkdev.h	2003-07-02 22:10:50.000000000 -0700
@@ -307,6 +307,8 @@ struct request_queue
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
+#define QUEUE_FLAG_READFULL	3	/* read queue has been filled */
+#define QUEUE_FLAG_WRITEFULL	4	/* write queue has been filled */
 
 #define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
@@ -322,6 +324,30 @@ struct request_queue
 
 #define rq_data_dir(rq)		((rq)->flags & 1)
 
+static inline int blk_queue_full(struct request_queue *q, int rw)
+{
+	if (rw == READ)
+		return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
+	return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+}
+
+static inline void blk_set_queue_full(struct request_queue *q, int rw)
+{
+	if (rw == READ)
+		set_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
+	else
+		set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+}
+
+static inline void blk_clear_queue_full(struct request_queue *q, int rw)
+{
+	if (rw == READ)
+		clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
+	else
+		clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+}
+
+
 /*
  * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
  * it already be started by driver.
_
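
For readers without the surrounding code handy, here is a small user-space
sketch of the policy the changelog describes: the full flag is set when the
last free request is taken, each freed request wakes one exclusive waiter
once the count is at or below the batch watermark, and the flag is only
cleared once the wait queue has drained, so a newcomer cannot slip past
sleeping tasks.  All names here (fake_queue, newcomer_may_allocate,
NR_REQUESTS, BATCH_WATERMARK) are invented for illustration, the woken
waiter is modelled as immediately retaking the freed request (standing in
for the force/waited path in get_request_wait()), and none of this is
kernel code; the authoritative logic is in the get_request() and
__blk_put_request() hunks above.

#include <stdio.h>
#include <stdbool.h>

#define NR_REQUESTS	8			/* stands in for q->nr_requests */
#define BATCH_WATERMARK	(NR_REQUESTS - 2)	/* batch_requests(q) when nr_requests == 8 */

struct fake_queue {
	int count;	/* allocated requests, i.e. rl->count[rw] */
	int sleepers;	/* tasks parked in get_request_wait() */
	bool full;	/* QUEUE_FLAG_{READ,WRITE}FULL analogue */
};

/* Would a task that is NOT already on the wait queue get a request now?
 * Mirrors the !force path of the patched get_request(). */
static bool newcomer_may_allocate(const struct fake_queue *q)
{
	return !q->full && q->count < NR_REQUESTS;
}

/* Mirrors the patched __blk_put_request(): at or below the batch watermark,
 * hand each freed request to one exclusive waiter; only when the wait queue
 * is empty may the full flag be cleared (Chris Mason's fix). */
static void put_request(struct fake_queue *q)
{
	q->count--;
	if (q->count > BATCH_WATERMARK)
		return;
	if (q->sleepers > 0) {
		q->sleepers--;	/* wake_up(): one waiter runs... */
		q->count++;	/* ...and allocates via the force path */
	} else {
		q->full = false;	/* wait queue drained: reopen for newcomers */
	}
}

int main(void)
{
	/* The changelog scenario: queue is full, two tasks already asleep. */
	struct fake_queue q = { .count = NR_REQUESTS, .sleepers = 2, .full = true };
	int i;

	for (i = 1; i <= 4; i++) {
		put_request(&q);
		printf("free #%d: count=%d sleepers=%d full=%d newcomer %s\n",
		       i, q.count, q.sleepers, (int)q.full,
		       newcomer_may_allocate(&q) ? "may allocate" : "must sleep");
	}
	return 0;
}

In this model both sleepers are served before any new arrival gets a look
in, and only the fourth free, which finds the wait queue empty, clears the
full flag - which is the behaviour the "With the patch" list above is
describing.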