diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/block/blkpg.c x/drivers/block/blkpg.c
--- x-ref/drivers/block/blkpg.c	2003-03-15 03:25:00.000000000 +0100
+++ x/drivers/block/blkpg.c	2003-09-02 01:33:55.000000000 +0200
@@ -261,10 +261,10 @@ int blk_ioctl(kdev_t dev, unsigned int c
 			return blkpg_ioctl(dev, (struct blkpg_ioctl_arg *) arg);
 
 		case BLKELVGET:
-			return blkelvget_ioctl(&blk_get_queue(dev)->elevator,
+			return blkelvget_ioctl(blk_get_queue(dev),
					       (blkelv_ioctl_arg_t *) arg);
 		case BLKELVSET:
-			return blkelvset_ioctl(&blk_get_queue(dev)->elevator,
+			return blkelvset_ioctl(blk_get_queue(dev),
					       (blkelv_ioctl_arg_t *) arg);
 
 		case BLKBSZGET:
diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/block/elevator.c x/drivers/block/elevator.c
--- x-ref/drivers/block/elevator.c	2003-09-02 01:33:40.000000000 +0200
+++ x/drivers/block/elevator.c	2003-09-02 01:33:55.000000000 +0200
@@ -249,23 +249,28 @@ int elevator_noop_merge(request_queue_t
 
 void elevator_noop_merge_req(struct request *req, struct request *next) {}
 
-int blkelvget_ioctl(elevator_t * elevator, blkelv_ioctl_arg_t * arg)
+int blkelvget_ioctl(request_queue_t *q, blkelv_ioctl_arg_t * arg)
 {
+	elevator_t *elevator = &q->elevator;
 	blkelv_ioctl_arg_t output;
 
 	output.queue_ID = elevator->queue_ID;
 	output.read_latency = elevator->read_latency;
 	output.write_latency = elevator->write_latency;
-	output.max_bomb_segments = 0;
-
+	output.max_bomb_segments = q->max_queue_sectors;
+	if (q->low_latency)
+		output.max_bomb_segments |= MAX_BOMB_LATENCY_MASK;
+	else
+		output.max_bomb_segments &= ~MAX_BOMB_LATENCY_MASK;
 	if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t)))
 		return -EFAULT;
 
 	return 0;
 }
 
-int blkelvset_ioctl(elevator_t * elevator, const blkelv_ioctl_arg_t * arg)
+int blkelvset_ioctl(request_queue_t *q, const blkelv_ioctl_arg_t * arg)
 {
+	elevator_t *elevator = &q->elevator;
 	blkelv_ioctl_arg_t input;
 
 	if (copy_from_user(&input, arg, sizeof(blkelv_ioctl_arg_t)))
@@ -275,9 +280,23 @@ int blkelvset_ioctl(elevato
 		return -EINVAL;
 	if (input.write_latency < 0)
 		return -EINVAL;
+	if (input.max_bomb_segments < 0)
+		return -EINVAL;
 
 	elevator->read_latency = input.read_latency;
 	elevator->write_latency = input.write_latency;
+	q->low_latency = input.max_bomb_segments & MAX_BOMB_LATENCY_MASK ? 1:0;
+	printk(KERN_INFO "queue %d: low latency mode is now %s\n", elevator->queue_ID,
+		q->low_latency ? "on" : "off");
+	input.max_bomb_segments &= ~MAX_BOMB_LATENCY_MASK;
+	if (input.max_bomb_segments) {
+		q->max_queue_sectors = input.max_bomb_segments;
+		q->batch_sectors = q->max_queue_sectors / 4;
+		/* changing around these numbers might cause a missed wakeup */
+		wake_up(&q->wait_for_requests);
+	}
+	printk(KERN_INFO "queue %d: max queue sectors is now %d\n", elevator->queue_ID,
+		q->max_queue_sectors);
 
 	return 0;
 }
diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/block/ll_rw_blk.c x/drivers/block/ll_rw_blk.c
--- x-ref/drivers/block/ll_rw_blk.c	2003-09-02 01:33:40.000000000 +0200
+++ x/drivers/block/ll_rw_blk.c	2003-09-02 01:36:48.000000000 +0200
@@ -450,6 +450,19 @@ static inline void __generic_unplug_devi
 			if (next == q->last_merge)
 				q->last_merge = NULL;
 
+			/* we don't want merges later on to come in
+			 * and significantly increase the amount of
+			 * work during an unplug, it can lead to high
+			 * latencies while some poor waiter tries to
+			 * run an ever increasing chunk of io.
+			 * This does lower throughput some though.
+			 */
+			if (q->low_latency) {
+				struct request *rq;
+				rq = blkdev_entry_prev_request(&q->queue_head),
+				rq->elevator_sequence = 0;
+			}
+
 			q->request_fn(q);
 		}
 	}
@@ -685,7 +698,9 @@ void blk_init_queue(request_queue_t * q,
 	q->plug_tq.routine = &generic_unplug_device;
 	q->plug_tq.data = q;
 	q->plugged = 0;
+	q->full = 0;
 	q->can_throttle = 0;
+	q->low_latency = 0;
 	q->last_merge = NULL;
 
 	/*
@@ -704,7 +719,7 @@ void blk_init_queue(request_queue_t * q,
  * Get a free request. io_request_lock must be held and interrupts
  * disabled on the way in. Returns NULL if there are no free requests.
  */
-static struct request *get_request(request_queue_t *q, int rw)
+static struct request *__get_request(request_queue_t *q, int rw)
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
@@ -720,9 +735,9 @@ static struct request *get_request(reque
 		 * pending, bail out
 		 */
 		if ((rw == WRITE) || (rw == READ && rl->pending[READ] > rlim))
-			return NULL;
+			goto full;
 		if (blk_oversized_queue_reads(q))
-			return NULL;
+			goto full;
 	}
 
 	if (!list_empty(&rl->free)) {
@@ -734,12 +749,37 @@ static struct request *get_request(reque
 		rq->cmd = rw;
 		rq->special = NULL;
 		rq->q = q;
+	} else {
+full:
+		q->full = 1;
 	}
 
 	return rq;
 }
 
 /*
+ * get a free request, honoring the queue_full condition
+ */
+static inline struct request *get_request(request_queue_t *q, int rw)
+{
+	if (q->full)
+		return NULL;
+	return __get_request(q, rw);
+}
+
+/*
+ * helper func to do memory barriers and wakeups when we finally decide
+ * to clear the queue full condition
+ */
+static inline void clear_full_and_wake(request_queue_t *q)
+{
+	q->full = 0;
+	mb();
+	if (waitqueue_active(&q->wait_for_requests))
+		wake_up(&q->wait_for_requests);
+}
+
+/*
  * Here's the request allocation design, low latency version:
  *
  * 1: Blocking on request exhaustion is a key part of I/O throttling.
@@ -787,23 +827,28 @@ static struct request *__get_request_wai
 {
 	register struct request *rq;
 	DECLARE_WAITQUEUE(wait, current);
+	int oversized;
 
 	add_wait_queue_exclusive(&q->wait_for_requests, &wait);
 
 	do {
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_lock_irq(q->queue_lock);
-		if (blk_oversized_queue(q) || q->rq.count == 0) {
-			__generic_unplug_device(q);
+		oversized = blk_oversized_queue(q) || q->rq.count == 0;
+		if (q->full || oversized) {
+			if (oversized)
+				__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			schedule();
 			spin_lock_irq(q->queue_lock);
 		}
-		rq = get_request(q, rw);
+		rq = __get_request(q, rw);
 		spin_unlock_irq(q->queue_lock);
 	} while (rq == NULL);
 	remove_wait_queue(&q->wait_for_requests, &wait);
 	current->state = TASK_RUNNING;
+	if (!waitqueue_active(&q->wait_for_requests))
+		clear_full_and_wake(q);
 
 	return rq;
 }
@@ -1061,6 +1106,8 @@ void blkdev_release_request(struct reque
 			smp_mb();
 			if (waitqueue_active(&q->wait_for_requests))
 				wake_up(&q->wait_for_requests);
+			else
+				clear_full_and_wake(q);
 		}
 	}
 }
@@ -1273,7 +1320,7 @@ get_rq:
 	 * See description above __get_request_wait()
 	 */
 	if (rw_ahead) {
-		if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) {
+		if (q->full || q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) {
 			spin_unlock_irq(q->queue_lock);
 			goto end_io;
 		}
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/blkdev.h x/include/linux/blkdev.h
--- x-ref/include/linux/blkdev.h	2003-09-02 01:33:40.000000000 +0200
+++ x/include/linux/blkdev.h	2003-09-02 01:33:55.000000000 +0200
@@ -140,6 +140,13 @@ struct request_queue
 	int head_active:1;
 
 	/*
+	 * Booleans that indicate whether the queue's free requests have
+	 * been exhausted and is waiting to drop below the batch_requests
+	 * threshold
+	 */
+	int full:1;
+
+	/*
 	 * Boolean that indicates you will use blk_started_sectors
 	 * and blk_finished_sectors in addition to blk_started_io
 	 * and blk_finished_io. It enables the throttling code to
@@ -147,6 +154,12 @@ struct request_queue
 	 */
 	int can_throttle:1;
 
+	/*
+	 * Boolean that indicates the queue should prefer low
+	 * latency over throughput. This enables the q->full checks
+	 */
+	int low_latency:1;
+
 	unsigned long bounce_pfn;
 
 	/*
diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/elevator.h x/include/linux/elevator.h
--- x-ref/include/linux/elevator.h	2003-08-26 00:13:07.000000000 +0200
+++ x/include/linux/elevator.h	2003-09-02 01:33:55.000000000 +0200
@@ -35,14 +35,19 @@ typedef struct blkelv_ioctl_arg_s {
 	int queue_ID;
 	int read_latency;
 	int write_latency;
+/*
+ * (max_bomb_segments & MAX_BOMB_LATENCY_MASK) == 1 indicates low latency
+ * mode. We're using any odd number to indicate low latency is on.
+ */
+#define MAX_BOMB_LATENCY_MASK 1
 	int max_bomb_segments;
 } blkelv_ioctl_arg_t;
 
 #define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
 #define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
 
-extern int blkelvget_ioctl(elevator_t *, blkelv_ioctl_arg_t *);
-extern int blkelvset_ioctl(elevator_t *, const blkelv_ioctl_arg_t *);
+extern int blkelvget_ioctl(request_queue_t *, blkelv_ioctl_arg_t *);
+extern int blkelvset_ioctl(request_queue_t *, const blkelv_ioctl_arg_t *);
 
 extern void elevator_init(elevator_t *, elevator_t);
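
The user-visible side of this patch is that the low-latency bit and the queue size are toggled from userspace through the existing BLKELVGET/BLKELVSET ioctls. Below is a minimal sketch, not part of the patch, of a tool in the spirit of elvtune that turns the low-latency bit on; the struct layout, ioctl numbers and MAX_BOMB_LATENCY_MASK are copied from the include/linux/elevator.h hunk above, the device path is whatever block device you pass in, and it assumes 2.4-era semantics for these ioctls.

/*
 * Sketch only: read the current elevator settings, set the low-latency
 * bit in max_bomb_segments, and write them back.  Definitions mirror the
 * patched <linux/elevator.h>; treat this as an illustration, not a
 * supported interface on later kernels.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

typedef struct blkelv_ioctl_arg_s {
	int queue_ID;
	int read_latency;
	int write_latency;
	int max_bomb_segments;		/* bit 0 now selects low-latency mode */
} blkelv_ioctl_arg_t;

#define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
#define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
#define MAX_BOMB_LATENCY_MASK 1

int main(int argc, char **argv)
{
	blkelv_ioctl_arg_t arg;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block device>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, BLKELVGET, &arg) < 0) {
		perror("BLKELVGET");
		return 1;
	}
	/* keep max_queue_sectors as reported, just flip the low-latency bit on */
	arg.max_bomb_segments |= MAX_BOMB_LATENCY_MASK;
	if (ioctl(fd, BLKELVSET, &arg) < 0) {
		perror("BLKELVSET");
		return 1;
	}
	printf("queue %d: requested low latency mode\n", arg.queue_ID);
	close(fd);
	return 0;
}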