Patch from Nick Piggin

Now there is 1 forward decl which AFAIKS really can't go.  Better arrangement.  More comments.

The last BUG_ON fix is included in this, but other than that there should be no functional changes.

 drivers/block/as-iosched.c | 1083 ++++++++++++++++++++++-----------------------
 1 files changed, 546 insertions(+), 537 deletions(-)

diff -puN drivers/block/as-iosched.c~as-cleanup-2 drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-cleanup-2 2003-03-16 19:49:24.000000000 -0800
+++ 25-akpm/drivers/block/as-iosched.c 2003-03-16 19:49:24.000000000 -0800
@@ -81,15 +81,6 @@ static unsigned long write_batch_expire */ static unsigned long antic_expire = HZ / 100; -enum anticipation_states { - ANTIC_OFF=0, /* Not anticipating (normal operation) */ - ANTIC_WAIT_REQ, /* The last read has not yet completed */ - ANTIC_WAIT_NEXT, /* Currently anticipating a request vs - last read (which has completed) */ - ANTIC_FINISHED, /* Anticipating but have found a candidate - or timed out */ -}; - /* * This is the per-process anticipatory I/O scheduler state. It is refcounted * and kmalloc'ed.
@@ -158,12 +149,15 @@ struct as_data { unsigned long antic_expire; }; -enum arq_states { - AS_RQ_NEW=0, /* New - not referenced and not on any lists */ - AS_RQ_QUEUED, /* In the request queue. It belongs to the - scheduler */ - AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the - driver now */ +#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) + +enum anticipation_states { + ANTIC_OFF=0, /* Not anticipating (normal operation) */ + ANTIC_WAIT_REQ, /* The last read has not yet completed */ + ANTIC_WAIT_NEXT, /* Currently anticipating a request vs + last read (which has completed) */ + ANTIC_FINISHED, /* Anticipating but have found a candidate + or timed out */ }; /*
@@ -192,7 +186,15 @@ struct as_rq { struct list_head fifo; unsigned long expires; - unsigned long state; + unsigned long state; /* debug only */ +}; + +enum arq_states { + AS_RQ_NEW=0, /* New - not referenced and not on any lists */ + AS_RQ_QUEUED, /* In the request queue. It belongs to the + scheduler */ + AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the + driver now */ }; #define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private)
@@ -389,40 +391,6 @@ static struct as_rq *as_find_first_arq(s } } -static struct as_rq * -as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2); - -/* - * as_find_next_arq finds the next request after @prev in elevator order. - */ -static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) -{ - const int data_dir = rq_data_dir(last->request); - struct as_rq *ret; - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct as_rq *arq_next, *arq_prev; - - BUG_ON(!ON_RB(&last->rb_node)); - - if (rbprev) - arq_prev = rb_entry_arq(rbprev); - else - arq_prev = NULL; - - if (rbnext) - arq_next = rb_entry_arq(rbnext); - else { - arq_next = as_find_first_arq(ad, data_dir); - if (arq_next == last) - arq_next = NULL; - } - - ret = as_choose_req(ad, arq_next, arq_prev); - - return ret; -} - static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq) { struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
@@ -446,7 +414,6 @@ static struct as_rq *__as_add_arq_rb(str } static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); - /* * Add the request to the rb tree if it is unique.
If there is an alias (an * existing request against the same sector), which can happen when using @@ -460,6 +427,7 @@ static void as_add_arq_rb(struct as_data arq->rb_key = rq_rb_key(rq); + /* This can be caused by direct IO */ while ((alias = __as_add_arq_rb(ad, arq))) as_move_to_dispatch(ad, alias); @@ -494,395 +462,173 @@ as_find_arq_rb(struct as_data *ad, secto return NULL; } -static void as_antic_waitnext(struct as_data *ad); +/* + * IO Scheduler proper + */ + +#define MAXBACK (512 * 1024) /* Maximum distance a process can seek backward + from a previous request it has made. No + seeking backward between processes allowed */ /* - * as_update_iohist keeps a decaying histogram of IO thinktimes, and - * updates @aic->mean_thinktime based on that. It is called when a new - * request is queued. + * as_choose_req selects the preferred one of two requests of the same data_dir + * ignoring time - eg. timeouts, which is the job of as_dispatch_request */ -static void as_update_iohist(struct as_io_context *aic) +static struct as_rq * +as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) { - unsigned i; - unsigned long thinktime; - unsigned long total = 0; - unsigned long num = 0; + int data_dir; + sector_t last, s1, s2, d1, d2; + int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ + const sector_t maxback = MAXBACK; - if (aic == NULL) - return; + if (arq1 == NULL || arq1 == arq2) + return arq2; + if (arq2 == NULL) + return arq1; - if (test_bit(AS_TASK_IORUNNING, &aic->state)) { - thinktime = jiffies - aic->last_end_request; - thinktime = min(thinktime, MAX_THINKTIME-1); - aic->thinktime[thinktime] += 256; /* fixed point: 1.0 == 1<<8 */ + data_dir = rq_data_dir(arq1->request); - for (i = 0; i < MAX_THINKTIME; i++) { - unsigned long tt = aic->thinktime[i]; - total += i*tt; - num += tt; + last = ad->last_sector[data_dir]; + s1 = arq1->request->sector; + s2 = arq2->request->sector; - aic->thinktime[i] = (tt>>1) + (tt>>2); /* 75% decay */ - } - /* fixed point factor is cancelled here */ - if (num) - aic->mean_thinktime = total / num; + BUG_ON(data_dir != rq_data_dir(arq2->request)); + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. Only for reads and only between reads + * from the same process! + */ + if (s1 >= last) + d1 = s1 - last; + else if (data_dir == READ + && ad->as_io_context == arq1->as_io_context + && s1+maxback >= last) + d1 = (last - s1)*2; + else { + r1_wrap = 1; + d1 = 0; /* shut up, gcc */ + } + + if (s2 >= last) + d2 = s2 - last; + else if (data_dir == READ + && ad->as_io_context == arq2->as_io_context + && s2+maxback >= last) + d2 = (last - s2)*2; + else { + r2_wrap = 1; + d2 = 0; + } + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return arq1; + else if (!r2_wrap && r1_wrap) + return arq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return arq1; + else + return arq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return arq1; + else if (d2 < d1) + return arq2; + else { + if (s1 >= s2) + return arq1; + else + return arq2; } } /* - * Look Ma, no comment! + * as_find_next_arq finds the next request after @prev in elevator order. + * this with as_choose_arq form the basis for how the scheduler chooses + * what request to process next. Anticipation works on top of this. 
*/ - -static void as_complete_arq(struct as_data *ad, struct as_rq *arq) +static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) { - if (!arq->as_io_context) - return; + const int data_dir = rq_data_dir(last->request); + struct as_rq *ret; + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct as_rq *arq_next, *arq_prev; - if (rq_data_dir(arq->request) == READ) { - set_bit(AS_TASK_IORUNNING, &arq->as_io_context->state); - arq->as_io_context->last_end_request = jiffies; - } + BUG_ON(!ON_RB(&last->rb_node)); - if (ad->as_io_context == arq->as_io_context) { - ad->antic_start = jiffies; - ad->aic_finished = 1; - if (ad->antic_status == ANTIC_WAIT_REQ) { - /* - * We were waiting on this request, now anticipate - * the next one - */ - as_antic_waitnext(ad); - } + if (rbprev) + arq_prev = rb_entry_arq(rbprev); + else + arq_prev = NULL; + + if (rbnext) + arq_next = rb_entry_arq(rbnext); + else { + arq_next = as_find_first_arq(ad, data_dir); + if (arq_next == last) + arq_next = NULL; } - put_as_io_context(&arq->as_io_context); -} -static void as_update_arq(struct as_data *ad, struct as_rq *arq); + ret = as_choose_req(ad, arq_next, arq_prev); + + return ret; +} /* - * add arq to rbtree and fifo + * anticipatory scheduling functions follow */ -static void as_add_request(struct as_data *ad, struct as_rq *arq) -{ - const int data_dir = rq_data_dir(arq->request); - - arq->as_io_context = get_as_io_context(); - if (arq->as_io_context) { - atomic_inc(&arq->as_io_context->nr_queued); - if (data_dir == READ) - as_update_iohist(arq->as_io_context); - } +/* + * as_antic_expired tells us when we have anticipated too long. + * The funny "absolute difference" math on the elapsed time is to handle + * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. + */ +static int as_antic_expired(struct as_data *ad) +{ + long delta_jif; - as_add_arq_rb(ad, arq); + delta_jif = jiffies - ad->antic_start; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < ad->antic_expire) + return 0; - /* - * set expire time (only used for reads) and add to fifo list - */ - arq->expires = jiffies + ad->fifo_expire[data_dir]; - list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]); - arq->state = AS_RQ_QUEUED; - as_update_arq(ad, arq); /* keep state machine up to date */ + return 1; } /* - * as_remove_queued_request removes a request from the pre dispatch queue - * without updating refcounts. It is expected the caller will drop the - * reference unless it replaces the request at somepart of the elevator - * (ie. the dispatch queue) + * as_antic_waitnext starts anticipating that a nice request will soon be + * submitted. 
See also as_antic_waitreq */ -static void as_remove_queued_request(request_queue_t *q, struct request *rq) +static void as_antic_waitnext(struct as_data *ad) { - struct as_rq *arq = RQ_DATA(rq); - - if (!arq) - BUG(); - else { - const int data_dir = rq_data_dir(arq->request); - struct as_data *ad = q->elevator.elevator_data; + unsigned long timeout; - BUG_ON(arq->state != AS_RQ_QUEUED); + BUG_ON(ad->antic_status != ANTIC_OFF + && ad->antic_status != ANTIC_WAIT_REQ); - if (arq->as_io_context) { - BUG_ON(!atomic_read(&arq->as_io_context->nr_queued)); - atomic_dec(&arq->as_io_context->nr_queued); - } - - /* - * Update the "next_arq" cache if we are about to remove its - * entry - */ - if (ad->next_arq[data_dir] == arq) - ad->next_arq[data_dir] = as_find_next_arq(ad, arq); - - list_del_init(&arq->fifo); - as_del_arq_hash(arq); - as_del_arq_rb(ad, arq); - - if (q->last_merge == &rq->queuelist) - q->last_merge = NULL; - - list_del_init(&rq->queuelist); - } - -} - -/* - * as_remove_dispatched_request is called when a driver has completed the - * request (or it has caused an error), and is finished with it. It assumes - * the request is on the dispatch queue. - */ -static void as_remove_request(request_queue_t *q, struct request *rq) -{ - struct as_rq *arq = RQ_DATA(rq); - struct as_data *ad = q->elevator.elevator_data; - - if (q->last_merge == &rq->queuelist) - q->last_merge = NULL; - - list_del_init(&rq->queuelist); - - if (arq) { - list_del_init(&arq->fifo); - as_del_arq_hash(arq); - as_del_arq_rb(ad, arq); - if (arq->as_io_context) { - WARN_ON(!atomic_read(&arq->as_io_context->nr_dispatched)); - atomic_dec(&arq->as_io_context->nr_dispatched); - } - as_complete_arq(ad, arq); - } -} - -static int -as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio) -{ - struct as_data *ad = q->elevator.elevator_data; - struct request *__rq; - int ret; - - /* - * try last_merge to avoid going to hash - */ - ret = elv_try_last_merge(q, bio); - if (ret != ELEVATOR_NO_MERGE) { - __rq = list_entry_rq(q->last_merge); - goto out_insert; - } - - /* - * see if the merge hash can satisfy a back merge - */ - __rq = as_find_arq_hash(ad, bio->bi_sector); - if (__rq) { - BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_BACK_MERGE; - goto out; - } - } - - /* - * check for front merge - */ - if (ad->front_merges) { - sector_t rb_key = bio->bi_sector + bio_sectors(bio); - - __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio)); - if (__rq) { - BUG_ON(rb_key != rq_rb_key(__rq)); - - if (elv_rq_merge_ok(__rq, bio)) { - ret = ELEVATOR_FRONT_MERGE; - goto out; - } - } - } - - return ELEVATOR_NO_MERGE; -out: - q->last_merge = &__rq->queuelist; -out_insert: - *insert = &__rq->queuelist; - return ret; -} - -static void as_merged_request(request_queue_t *q, struct request *req) -{ - struct as_data *ad = q->elevator.elevator_data; - struct as_rq *arq = RQ_DATA(req); - - /* - * hash always needs to be repositioned, key is end sector - */ - as_del_arq_hash(arq); - as_add_arq_hash(ad, arq); - - /* - * if the merge was a front merge, we need to reposition request - */ - if (rq_rb_key(req) != arq->rb_key) { - as_del_arq_rb(ad, arq); - as_add_arq_rb(ad, arq); - /* - * Note! At this stage of this and the next function, our next - * request may not be optimal - eg the request may have "grown" - * behind the disk head. We currently don't bother adjusting. 
- */ - } - - q->last_merge = &req->queuelist; -} - -static void -as_merged_requests(request_queue_t *q, struct request *req, - struct request *next) -{ - struct as_data *ad = q->elevator.elevator_data; - struct as_rq *arq = RQ_DATA(req); - struct as_rq *anext = RQ_DATA(next); - - BUG_ON(!arq); - BUG_ON(!anext); - - /* - * reposition arq (this is the merged request) in hash, and in rbtree - * in case of a front merge - */ - as_del_arq_hash(arq); - as_add_arq_hash(ad, arq); - - if (rq_rb_key(req) != arq->rb_key) { - as_del_arq_rb(ad, arq); - as_add_arq_rb(ad, arq); - } - - /* - * if anext expires before arq, assign its expire time to arq - * and move into anext position (anext will be deleted) in fifo - */ - if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) { - if (time_before(anext->expires, arq->expires)) { - list_move(&arq->fifo, &anext->fifo); - arq->expires = anext->expires; - /* - * Don't copy here but swap, because when anext is - * removed below, it must contain the unused context - */ - swap_as_io_context(&arq->as_io_context, - &anext->as_io_context); - } - } - - /* - * kill knowledge of next, this one is a goner - */ - as_remove_queued_request(q, next); - put_as_io_context(&anext->as_io_context); -} - -static void as_antic_stop(struct as_data *ad); - -/* - * move an entry to dispatch queue - */ -static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) -{ - const int data_dir = rq_data_dir(arq->request); - - BUG_ON(!ON_RB(&arq->rb_node)); - - as_antic_stop(ad); - ad->antic_status = ANTIC_OFF; - - /* - * This has to be set in order to be correctly updated by - * as_find_next_arq - */ - ad->last_sector[data_dir] = arq->request->sector - + arq->request->nr_sectors; - - if (data_dir == READ) { - /* In case we have to anticipate after this */ - copy_as_io_context(&ad->as_io_context, &arq->as_io_context); - ad->aic_finished = 0; - } else - put_as_io_context(&ad->as_io_context); - - ad->next_arq[data_dir] = as_find_next_arq(ad, arq); - - /* - * take it off the sort and fifo list, add to dispatch queue - */ - as_remove_queued_request(ad->q, arq->request); - list_add_tail(&arq->request->queuelist, ad->dispatch); - if (arq->as_io_context) - atomic_inc(&arq->as_io_context->nr_dispatched); - - BUG_ON(arq->state != AS_RQ_QUEUED); - arq->state = AS_RQ_DISPATCHED; -} - -#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) - -/* - * as_fifo_expired returns 0 if there are no expired reads on the fifo, - * 1 otherwise. It is ratelimited so that we only perform the check once per - * `fifo_expire' interval. Otherwise a large number of expired requests - * would create a hopeless seekstorm. - * - * The funny "absolute difference" math on the elapsed time is to handle - * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. 
- */ -static int as_fifo_expired(struct as_data *ad, int adir) -{ - struct as_rq *arq; - long delta_jif; - - delta_jif = jiffies - ad->last_check_fifo[adir]; - if (unlikely(delta_jif < 0)) - delta_jif = -delta_jif; - if (delta_jif < ad->fifo_expire[adir]) - return 0; - - ad->last_check_fifo[adir] = jiffies; - - if (list_empty(&ad->fifo_list[adir])) - return 0; - - arq = list_entry_fifo(ad->fifo_list[adir].next); - - return time_after(jiffies, arq->expires); -} - -static int as_antic_expired(struct as_data *ad) -{ - long delta_jif; - - delta_jif = jiffies - ad->antic_start; - if (unlikely(delta_jif < 0)) - delta_jif = -delta_jif; - if (delta_jif < ad->antic_expire) - return 0; - - return 1; -} - -/* - * as_batch_expired returns true if the current batch has expired. - */ -static inline int as_batch_expired(struct as_data *ad) -{ - return time_after(jiffies, ad->current_batch_expires); + timeout = ad->antic_start + ad->antic_expire; +#if 0 /* TODO unif me. This should be fixed. */ + timeout = min(timeout, ad->current_batch_expires); +#endif + mod_timer(&ad->antic_timer, timeout); + + ad->antic_status = ANTIC_WAIT_NEXT; } /* - * anticipatory scheduling functions follow + * as_antic_waitreq starts anticipating. We don't start timing the anticipation + * until the request that we're anticipating on has finished. This means we + * are timing from when the candidate process wakes up hopefully. */ - -static int as_queue_notready(request_queue_t *q); - static void as_antic_waitreq(struct as_data *ad) { BUG_ON(ad->antic_status == ANTIC_FINISHED); @@ -896,46 +642,6 @@ static void as_antic_waitreq(struct as_d } } -static void as_antic_waitnext(struct as_data *ad) -{ - unsigned long timeout; - - BUG_ON(ad->antic_status != ANTIC_OFF - && ad->antic_status != ANTIC_WAIT_REQ); - - timeout = ad->antic_start + ad->antic_expire; -#if 0 - /* FIX THIS!!! */ - timeout = min(timeout, ad->current_batch_expires); -#endif - mod_timer(&ad->antic_timer, timeout); - - ad->antic_status = ANTIC_WAIT_NEXT; -} - -/* - * This is executed in a "deferred" process context, by kblockd. It calls the - * driver's request_fn so the driver can submit that request. - * - * IMPORTANT! Thisguy will reenter the elevator, so set up all queue global - * state before calling, and don't rely on any state over calls. - * - * FIXME! dispatch queue is not a queue at all! - * Andrew! as_queue_notready does not _try_ to move a request to dispatch - * list, in fact it tries not to! Unfortunately it sometimes must in order - * to guarantee elv_next_request will return !NULL after a ready indication. - */ -static void as_work_handler(void *data) -{ - struct request_queue *q = data; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (!as_queue_notready(q)) - q->request_fn(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - /* * This is called directly by the functions in this file to stop anticipation. * We kill the timer and schedule a call to the request_fn asap. @@ -948,6 +654,7 @@ static void as_antic_stop(struct as_data if (status == ANTIC_WAIT_NEXT) del_timer(&ad->antic_timer); ad->antic_status = ANTIC_FINISHED; + /* see as_work_handler */ kblockd_schedule_work(&ad->antic_work); } } @@ -1005,7 +712,7 @@ static int as_close_req(struct as_data * * * If the task which has submitted the request has exitted, break anticipation. * - * If this task has queued some other reads, do not enter enticipation. + * If this task has queued some other IO, do not enter enticipation. 
*/ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) { @@ -1051,6 +758,75 @@ static int as_can_break_anticipation(str } /* + * as_can_anticipate indicates weather we should either run arq + * or keep anticipating a better request. + */ +static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) +{ + if (!ad->as_io_context) + /* + * Last request submitted was a write + */ + return 0; + + if (ad->antic_status == ANTIC_FINISHED) + /* + * Don't restart if we have just finished. Run the next request + */ + return 0; + + if (arq && as_can_break_anticipation(ad, arq)) + /* + * This request is a good candidate. Don't keep anticipating, + * run it. + */ + return 0; + + /* + * OK from here, we haven't finished, and don't have a decent request! + * Status is either ANTIC_OFF so start waiting, + * ANTIC_WAIT_REQ so continue waiting for request to finish + * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request. + * + */ + + return 1; +} + +/* + * as_update_iohist keeps a decaying histogram of IO thinktimes, and + * updates @aic->mean_thinktime based on that. It is called when a new + * request is queued. + */ +static void as_update_iohist(struct as_io_context *aic) +{ + unsigned i; + unsigned long thinktime; + unsigned long total = 0; + unsigned long num = 0; + + if (aic == NULL) + return; + + if (test_bit(AS_TASK_IORUNNING, &aic->state)) { + thinktime = jiffies - aic->last_end_request; + thinktime = min(thinktime, MAX_THINKTIME-1); + aic->thinktime[thinktime] += 256; /* fixed point: 1.0 == 1<<8 */ + + for (i = 0; i < MAX_THINKTIME; i++) { + unsigned long tt = aic->thinktime[i]; + total += i*tt; + num += tt; + + aic->thinktime[i] = (tt>>1) + (tt>>2); /* 75% decay */ + } + /* fixed point factor is cancelled here */ + if (num) + aic->mean_thinktime = total / num; + } +} + +/* * as_update_arq must be called whenever a request (arq) is added to * the sort_list. This function keeps caches up to date, and checks if the * request might be one we are "anticipating" @@ -1110,120 +886,178 @@ static void as_update_arq(struct as_data } /* - * as_can_anticipate indicates weather we should either run arq - * or keep anticipating a better request. + * as_complete_arq is to be called when a request has completed and returned + * something to the requesting process, be it an error or data. */ -static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) +static void as_complete_arq(struct as_data *ad, struct as_rq *arq) { - BUG_ON(ad->antic_status == ANTIC_WAIT_REQ || - ad->antic_status == ANTIC_WAIT_NEXT); + if (!arq->as_io_context) + return; - if (!ad->as_io_context) - /* - * Last request submitted was a write - */ - return 0; + if (rq_data_dir(arq->request) == READ) { + set_bit(AS_TASK_IORUNNING, &arq->as_io_context->state); + arq->as_io_context->last_end_request = jiffies; + } - if (ad->antic_status == ANTIC_FINISHED) - /* - * Don't restart if we have just finished. Run the next request - */ - return 0; + if (ad->as_io_context == arq->as_io_context) { + ad->antic_start = jiffies; + ad->aic_finished = 1; + if (ad->antic_status == ANTIC_WAIT_REQ) { + /* + * We were waiting on this request, now anticipate + * the next one + */ + as_antic_waitnext(ad); + } + } + put_as_io_context(&arq->as_io_context); +} + +/* + * as_remove_queued_request removes a request from the pre dispatch queue + * without updating refcounts. It is expected the caller will drop the + * reference unless it replaces the request at somepart of the elevator + * (ie. 
the dispatch queue) + */ +static void as_remove_queued_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + + if (!arq) + BUG(); + else { + const int data_dir = rq_data_dir(arq->request); + struct as_data *ad = q->elevator.elevator_data; + + BUG_ON(arq->state != AS_RQ_QUEUED); + + if (arq->as_io_context) { + BUG_ON(!atomic_read(&arq->as_io_context->nr_queued)); + atomic_dec(&arq->as_io_context->nr_queued); + } - if (arq && as_can_break_anticipation(ad, arq)) /* - * This request is a good candidate. Don't keep anticipating, - * run it. + * Update the "next_arq" cache if we are about to remove its + * entry */ + if (ad->next_arq[data_dir] == arq) + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + list_del_init(&arq->fifo); + as_del_arq_hash(arq); + as_del_arq_rb(ad, arq); + + if (q->last_merge == &rq->queuelist) + q->last_merge = NULL; + + list_del_init(&rq->queuelist); + } + +} + +/* + * as_remove_request is called when a driver has completed the request + * (or it has caused an error), and is finished with it. It assumes + * the request is on the dispatch queue. + */ +static void as_remove_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + struct as_data *ad = q->elevator.elevator_data; + + if (q->last_merge == &rq->queuelist) + q->last_merge = NULL; + + list_del_init(&rq->queuelist); + + if (arq) { + list_del_init(&arq->fifo); + as_del_arq_hash(arq); + as_del_arq_rb(ad, arq); + if (arq->as_io_context) { + WARN_ON(!atomic_read(&arq->as_io_context->nr_dispatched)); + atomic_dec(&arq->as_io_context->nr_dispatched); + } + as_complete_arq(ad, arq); + } +} + +/* + * as_fifo_expired returns 0 if there are no expired reads on the fifo, + * 1 otherwise. It is ratelimited so that we only perform the check once per + * `fifo_expire' interval. Otherwise a large number of expired requests + * would create a hopeless seekstorm. + * + * See as_antic_expired comment. + */ +static int as_fifo_expired(struct as_data *ad, int adir) +{ + struct as_rq *arq; + long delta_jif; + + delta_jif = jiffies - ad->last_check_fifo[adir]; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < ad->fifo_expire[adir]) return 0; - /* - * OK from here, we haven't finished, and don't have a decent request! - * Status is ANTIC_OFF so start waiting. - */ + ad->last_check_fifo[adir] = jiffies; - return 1; + if (list_empty(&ad->fifo_list[adir])) + return 0; + + arq = list_entry_fifo(ad->fifo_list[adir].next); + + return time_after(jiffies, arq->expires); } -#define MAXBACK (512 * 1024) +/* + * as_batch_expired returns true if the current batch has expired. A batch + * is a set of reads or a set of writes. + */ +static inline int as_batch_expired(struct as_data *ad) +{ + return time_after(jiffies, ad->current_batch_expires); +} /* - * as_choose_req selects the preferred one of two requests of the same data_dir - * ignoring time - eg. 
timeouts, which is the job of as_dispatch_request + * move an entry to dispatch queue */ -static struct as_rq * -as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) +static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) { - int data_dir; - sector_t last, s1, s2, d1, d2; - int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ - const sector_t maxback = MAXBACK; + const int data_dir = rq_data_dir(arq->request); + + BUG_ON(!ON_RB(&arq->rb_node)); - if (arq1 == NULL || arq1 == arq2) - return arq2; - if (arq2 == NULL) - return arq1; + as_antic_stop(ad); + ad->antic_status = ANTIC_OFF; - data_dir = rq_data_dir(arq1->request); + /* + * This has to be set in order to be correctly updated by + * as_find_next_arq + */ + ad->last_sector[data_dir] = arq->request->sector + + arq->request->nr_sectors; - last = ad->last_sector[data_dir]; - s1 = arq1->request->sector; - s2 = arq2->request->sector; + if (data_dir == READ) { + /* In case we have to anticipate after this */ + copy_as_io_context(&ad->as_io_context, &arq->as_io_context); + ad->aic_finished = 0; + } else + put_as_io_context(&ad->as_io_context); - BUG_ON(data_dir != rq_data_dir(arq2->request)); + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. Only for reads and only between reads - * from the same process! + * take it off the sort and fifo list, add to dispatch queue */ - if (s1 >= last) - d1 = s1 - last; - else if (data_dir == READ - && ad->as_io_context == arq1->as_io_context - && s1+maxback >= last) - d1 = (last - s1)*2; - else { - r1_wrap = 1; - d1 = 0; /* shut up, gcc */ - } - - if (s2 >= last) - d2 = s2 - last; - else if (data_dir == READ - && ad->as_io_context == arq2->as_io_context - && s2+maxback >= last) - d2 = (last - s2)*2; - else { - r2_wrap = 1; - d2 = 0; - } + as_remove_queued_request(ad->q, arq->request); + list_add_tail(&arq->request->queuelist, ad->dispatch); + if (arq->as_io_context) + atomic_inc(&arq->as_io_context->nr_dispatched); - /* Found required data */ - if (!r1_wrap && r2_wrap) - return arq1; - else if (!r2_wrap && r1_wrap) - return arq2; - else if (r1_wrap && r2_wrap) { - /* both behind the head */ - if (s1 <= s2) - return arq1; - else - return arq2; - } - - /* Both requests in front of the head */ - if (d1 < d2) - return arq1; - else if (d2 < d1) - return arq2; - else { - if (s1 >= s2) - return arq1; - else - return arq2; - } + BUG_ON(arq->state != AS_RQ_QUEUED); + arq->state = AS_RQ_DISPATCHED; } /* @@ -1312,7 +1146,6 @@ dispatch_writes: return 0; dispatch_request: - /* * If a request has expired, service it. 
*/ @@ -1349,6 +1182,32 @@ static struct request *as_next_request(r return rq; } +/* + * add arq to rbtree and fifo + */ +static void as_add_request(struct as_data *ad, struct as_rq *arq) +{ + const int data_dir = rq_data_dir(arq->request); + + arq->as_io_context = get_as_io_context(); + if (arq->as_io_context) { + atomic_inc(&arq->as_io_context->nr_queued); + + if (data_dir == READ) + as_update_iohist(arq->as_io_context); + } + + as_add_arq_rb(ad, arq); + + /* + * set expire time (only used for reads) and add to fifo list + */ + arq->expires = jiffies + ad->fifo_expire[data_dir]; + list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]); + arq->state = AS_RQ_QUEUED; + as_update_arq(ad, arq); /* keep state machine up to date */ +} + static void as_insert_request(request_queue_t *q, struct request *rq, struct list_head *insert_here) @@ -1444,6 +1303,154 @@ as_latter_request(request_queue_t *q, st return ret; } +static int +as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio) +{ + struct as_data *ad = q->elevator.elevator_data; + struct request *__rq; + int ret; + + /* + * try last_merge to avoid going to hash + */ + ret = elv_try_last_merge(q, bio); + if (ret != ELEVATOR_NO_MERGE) { + __rq = list_entry_rq(q->last_merge); + goto out_insert; + } + + /* + * see if the merge hash can satisfy a back merge + */ + __rq = as_find_arq_hash(ad, bio->bi_sector); + if (__rq) { + BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; + } + } + + /* + * check for front merge + */ + if (ad->front_merges) { + sector_t rb_key = bio->bi_sector + bio_sectors(bio); + + __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio)); + if (__rq) { + BUG_ON(rb_key != rq_rb_key(__rq)); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; + } + } + } + + return ELEVATOR_NO_MERGE; +out: + q->last_merge = &__rq->queuelist; +out_insert: + *insert = &__rq->queuelist; + return ret; +} + +static void as_merged_request(request_queue_t *q, struct request *req) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(req); + + /* + * hash always needs to be repositioned, key is end sector + */ + as_del_arq_hash(arq); + as_add_arq_hash(ad, arq); + + /* + * if the merge was a front merge, we need to reposition request + */ + if (rq_rb_key(req) != arq->rb_key) { + as_del_arq_rb(ad, arq); + as_add_arq_rb(ad, arq); + /* + * Note! At this stage of this and the next function, our next + * request may not be optimal - eg the request may have "grown" + * behind the disk head. We currently don't bother adjusting. 
+ */ + } + + q->last_merge = &req->queuelist; +} + +static void +as_merged_requests(request_queue_t *q, struct request *req, + struct request *next) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(req); + struct as_rq *anext = RQ_DATA(next); + + BUG_ON(!arq); + BUG_ON(!anext); + + /* + * reposition arq (this is the merged request) in hash, and in rbtree + * in case of a front merge + */ + as_del_arq_hash(arq); + as_add_arq_hash(ad, arq); + + if (rq_rb_key(req) != arq->rb_key) { + as_del_arq_rb(ad, arq); + as_add_arq_rb(ad, arq); + } + + /* + * if anext expires before arq, assign its expire time to arq + * and move into anext position (anext will be deleted) in fifo + */ + if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) { + if (time_before(anext->expires, arq->expires)) { + list_move(&arq->fifo, &anext->fifo); + arq->expires = anext->expires; + /* + * Don't copy here but swap, because when anext is + * removed below, it must contain the unused context + */ + swap_as_io_context(&arq->as_io_context, + &anext->as_io_context); + } + } + + /* + * kill knowledge of next, this one is a goner + */ + as_remove_queued_request(q, next); + put_as_io_context(&anext->as_io_context); +} + +/* + * This is executed in a "deferred" process context, by kblockd. It calls the + * driver's request_fn so the driver can submit that request. + * + * IMPORTANT! This guy will reenter the elevator, so set up all queue global + * state before calling, and don't rely on any state over calls. + * + * FIXME! dispatch queue is not a queue at all! + */ +static void as_work_handler(void *data) +{ + struct request_queue *q = data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (!as_queue_notready(q)) + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + static void as_exit(request_queue_t *q, elevator_t *e) { struct as_data *ad = e->elevator_data; @@ -1527,6 +1534,8 @@ static int as_init(request_queue_t *q, e ad->batch_expire[WRITE] = write_batch_expire; e->elevator_data = ad; + ad->current_batch_expires = jiffies + ad->batch_expire[READ]; + for (i = READ; i <= WRITE; i++) { struct request_list *rl = &q->rq[i]; struct list_head *entry; _
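
As a rough, stand-alone illustration of the seek metric that as_choose_req() applies above (not part of the patch): a forward seek costs its plain distance, a short backward seek within MAXBACK costs twice the distance, and anything further behind the head is treated as wrapped and always loses.  The user-space sketch below uses made-up sector numbers and drops the reads-only / same-process conditions for brevity.

#include <stdio.h>

typedef unsigned long long sector_t;

#define MAXBACK (512 * 1024)	/* same value the patch uses for the max backward seek */

/* cost of moving the head from 'last' to 's'; *wrap is set if s is too far behind */
static sector_t seek_cost(sector_t last, sector_t s, int *wrap)
{
	*wrap = 0;
	if (s >= last)
		return s - last;		/* forward seek: plain distance */
	if (s + MAXBACK >= last)
		return (last - s) * 2;		/* short backward seek: doubled */
	*wrap = 1;				/* behind the head: always loses */
	return 0;
}

int main(void)
{
	sector_t last = 1000000;		/* made-up sector numbers */
	int w1, w2;
	sector_t d1 = seek_cost(last, 1000400, &w1);	/* 400 sectors ahead  -> cost 400 */
	sector_t d2 = seek_cost(last,  999900, &w2);	/* 100 sectors behind -> cost 200 */

	printf("forward  cost=%llu wrap=%d\n", d1, w1);
	printf("backward cost=%llu wrap=%d\n", d2, w2);
	return 0;
}

With these numbers a 100-sector backseek (cost 200) still beats a 400-sector forward seek (cost 400), which is the 2x bias the heuristic is meant to express; in the real code the backseek allowance additionally applies only to reads from the same process.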