anticipatory I/O scheduler block/Makefile | 3 block/as-iosched.c | 1341 +++++++++++++++++++++++++++++++++++++++++++++++++++++ block/ll_rw_blk.c | 19 linux/elevator.h | 5 4 files changed, 1366 insertions(+), 2 deletions(-) diff -puN /dev/null drivers/block/as-iosched.c --- /dev/null 2002-08-30 16:31:37.000000000 -0700 +++ 25-akpm/drivers/block/as-iosched.c 2003-02-20 23:20:32.000000000 -0800 @@ -0,0 +1,1341 @@ +/* + * linux/drivers/block/as-iosched.c + * + * Anticipatory & deadline i/o scheduler. + * + * Copyright (C) 2002 Jens Axboe + * Nick Piggin + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ant_stats { + int reads; /* total read requests */ + int writes; /* total write requests */ + int anticipate_starts; + int expired_read_batches; + int expired_write_batches; + int timeouts; + int anticipate_hits; + int expired_fifo_reads; + int expired_fifo_writes; + int close_requests; + int matching_ids; + + int ant_delay_hist[100]; /* milliseconds */ + + /* + * This is a logarithmic (base 2) histogram + */ + int lba_forward_offsets[32]; + int lba_backward_offsets[32]; +} ant_stats; + +/* + * See Documentation/as-iosched.txt + */ + +/* + * max time before a read is submitted. + */ +static unsigned long read_expire = HZ / 10; + +/* + * ditto for writes, these limits are not hard, even + * if the disk is capable of satisfying them. + */ +static unsigned long write_expire = 1 * HZ; + +/* + * read_batch_expire describes how long we will allow a stream of reads to + * persist before looking to see whether it is time to switch over to writes. + */ +static unsigned long read_batch_expire = HZ / 5; + +/* + * write_batch_expire describes how long we will allow a stream of writes to + * persist before looking to see whether it is time to switch over to reads. + */ +static unsigned long write_batch_expire = HZ / 20; + +/* + * max time we may wait to anticipate a read + */ +static unsigned long antic_expire = HZ / 50; + +static const int as_hash_shift = 10; +#define DL_HASH_BLOCK(sec) ((sec) >> 3) +#define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), as_hash_shift)) +#define DL_HASH_ENTRIES (1 << as_hash_shift) +#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) +#define list_entry_hash(ptr) list_entry((ptr), struct as_rq, hash) +#define ON_HASH(drq) (drq)->hash_valid_count + +#define DL_INVALIDATE_HASH(dd) \ + do { \ + if (!++(dd)->hash_valid_count) \ + (dd)->hash_valid_count = 1; \ + } while (0) + +/* + * Nick! 
These need descriptions + */ +#define ANTIC_OFF 0 +#define ANTIC_WAIT 1 +#define ANTIC_TIMEOUT 2 +#define ANTIC_FOUND 3 + +struct as_data { + /* + * run time data + */ + + /* + * requests (as_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + struct as_rq *next_drq[2];/* next in sort order */ + sector_t last_sector[2]; /* last READ and WRITE sectors */ + struct list_head *dispatch; /* driver dispatch queue */ + struct list_head *hash; /* request hash */ + unsigned long hash_valid_count; /* barrier hash count */ + unsigned long current_batch_expires; + unsigned long last_check_fifo[2]; + int batch_data_dir; /* current/last batch READ or WRITE */ + + int antic_status; + unsigned long antic_start; /* jiffies: when it started */ + struct timer_list antic_timer; /* anticipatory scheduling timer */ + struct work_struct antic_work; /* anticipatory scheduling work */ + unsigned long current_id; /* Identify the expected process */ + + /* + * settings that change how the i/o scheduler behaves + */ + unsigned long fifo_expire[2]; + unsigned long batch_expire[2]; + unsigned long front_merges; + unsigned long antic_expire; +}; + +/* + * pre-request data. + */ +struct as_rq { + /* + * rbtree index, key is the starting offset + */ + struct rb_node rb_node; + sector_t rb_key; + + struct request *request; + + unsigned long request_id; + + /* + * request hash, key is the ending offset (for back merge lookup) + */ + struct list_head hash; + unsigned long hash_valid_count; + + /* + * expire fifo + */ + struct list_head fifo; + unsigned long expires; +}; + +static void +as_move_request(struct as_data *dd, struct as_rq *drq); + +/* + * as_update_drq must be called whenever a request (drq) is added to + * the sort_list. 
This function keeps caches up to date, and checks if the + * request might be one we are "anticipating" + */ +static void +as_update_drq(struct as_data *dd, struct as_rq *drq); + +static kmem_cache_t *drq_pool; + +#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private) + +/* + * the back merge hash support functions + */ +static inline void __as_del_drq_hash(struct as_rq *drq) +{ + drq->hash_valid_count = 0; + list_del_init(&drq->hash); +} + +static inline void as_del_drq_hash(struct as_rq *drq) +{ + if (ON_HASH(drq)) + __as_del_drq_hash(drq); +} + +static void +as_add_drq_hash(struct as_data *dd, struct as_rq *drq) +{ + struct request *rq = drq->request; + + BUG_ON(ON_HASH(drq)); + + drq->hash_valid_count = dd->hash_valid_count; + list_add(&drq->hash, &dd->hash[DL_HASH_FN(rq_hash_key(rq))]); +} + +static struct request * +as_find_drq_hash(struct as_data *dd, sector_t offset) +{ + struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; + + while ((entry = next) != hash_list) { + struct as_rq *drq = list_entry_hash(entry); + struct request *__rq = drq->request; + + next = entry->next; + + BUG_ON(!ON_HASH(drq)); + + if (!rq_mergeable(__rq) + || drq->hash_valid_count != dd->hash_valid_count) { + __as_del_drq_hash(drq); + continue; + } + + if (rq_hash_key(__rq) == offset) + return __rq; + } + + return NULL; +} + +/* + * rb tree support functions + */ +#define RB_NONE (2) +#define RB_EMPTY(root) ((root)->rb_node == NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) +#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) +#define rb_entry_drq(node) rb_entry((node), struct as_rq, rb_node) +#define DRQ_RB_ROOT(dd, drq) (&(dd)->sort_list[rq_data_dir((drq)->request)]) +#define rq_rb_key(rq) (rq)->sector + +/* + * as_find_first_drq finds the first (lowest sector numbered) request + * for the specified data_dir. Used to sweep back to the start of the disk + * (1-way elevator) after we process the last (highest sector) request. + */ +static struct as_rq * +as_find_first_drq(struct as_data *dd, int data_dir) +{ + struct rb_node *n = dd->sort_list[data_dir].rb_node; + + if (n == NULL) + return NULL; + + for (;;) { + if (n->rb_left == NULL) + return rb_entry_drq(n); + + n = n->rb_left; + } +} + +static struct as_rq * +__as_add_drq_rb(struct as_data *dd, struct as_rq *drq) +{ + struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node; + struct rb_node *parent = NULL; + struct as_rq *__drq; + + while (*p) { + parent = *p; + __drq = rb_entry_drq(parent); + + if (drq->rb_key < __drq->rb_key) + p = &(*p)->rb_left; + else if (drq->rb_key > __drq->rb_key) + p = &(*p)->rb_right; + else + return __drq; + } + + rb_link_node(&drq->rb_node, parent, p); + return 0; +} + +/* + * Add the request to the rb tree if it is unique. If there is an alias (an + * existing request against the same sector), which can happen when using + * direct IO, then move the alias to the dispatch list and then add the + * request. 
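+ *
+ * The sort_list rb tree cannot hold two requests with the same start
+ * sector: __as_add_drq_rb returns the already-queued request when the
+ * keys collide, so that request is moved straight to the dispatch
+ * queue (as_move_request) and the insertion of the new one is retried.
+ * Two O_DIRECT readers issuing a read at the same sector is the
+ * typical way such an alias arises.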
+ */ +static void +as_add_drq_rb(struct as_data *dd, struct as_rq *drq) +{ + struct as_rq *alias; + struct request *rq = drq->request; + + drq->rb_key = rq_rb_key(rq); + + while ((alias = __as_add_drq_rb(dd, drq))) + as_move_request(dd, alias); + + rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); + as_update_drq(dd, drq); +} + +static struct as_rq * +as_choose_req(struct as_data *dd, + struct as_rq *drq1, struct as_rq *drq2); + +static inline void +as_del_drq_rb(struct as_data *dd, struct as_rq *drq) +{ + const int data_dir = rq_data_dir(drq->request); + + if (dd->next_drq[data_dir] == drq) { + struct rb_node *rbnext = rb_next(&drq->rb_node); + struct rb_node *rbprev = rb_prev(&drq->rb_node); + struct as_rq *drq_next, *drq_prev; + + if (rbprev) + drq_prev = rb_entry_drq(rbprev); + else + drq_prev = NULL; + + if (rbnext) + drq_next = rb_entry_drq(rbnext); + else + drq_next = as_find_first_drq(dd, data_dir); + + dd->next_drq[data_dir] = as_choose_req(dd, + drq_next, drq_prev); + } + + if (ON_RB(&drq->rb_node)) { + rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq)); + RB_CLEAR(&drq->rb_node); + } +} + +static struct request * +as_find_drq_rb(struct as_data *dd, sector_t sector, int data_dir) +{ + struct rb_node *n = dd->sort_list[data_dir].rb_node; + struct as_rq *drq; + + while (n) { + drq = rb_entry_drq(n); + + if (sector < drq->rb_key) + n = n->rb_left; + else if (sector > drq->rb_key) + n = n->rb_right; + else + return drq->request; + } + + return NULL; +} + +/* + * add drq to rbtree and fifo + */ +static void +as_add_request(struct as_data *dd, struct as_rq *drq) +{ + const int data_dir = rq_data_dir(drq->request); + + as_add_drq_rb(dd, drq); + /* + * set expire time (only used for reads) and add to fifo list + */ + drq->expires = jiffies + dd->fifo_expire[data_dir]; + list_add_tail(&drq->fifo, &dd->fifo_list[data_dir]); +} + +/* + * remove rq from rbtree, fifo, and hash + */ +static void as_remove_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *drq = RQ_DATA(rq); + + if (drq) { + struct as_data *dd = q->elevator.elevator_data; + + list_del_init(&drq->fifo); + as_del_drq_hash(drq); + as_del_drq_rb(dd, drq); + } + + if (q->last_merge == &rq->queuelist) + q->last_merge = NULL; + + list_del_init(&rq->queuelist); +} + +static int +as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio) +{ + struct as_data *dd = q->elevator.elevator_data; + struct request *__rq; + int ret; + + /* + * try last_merge to avoid going to hash + */ + ret = elv_try_last_merge(q, bio); + if (ret != ELEVATOR_NO_MERGE) { + __rq = list_entry_rq(q->last_merge); + goto out_insert; + } + + /* + * see if the merge hash can satisfy a back merge + */ + __rq = as_find_drq_hash(dd, bio->bi_sector); + if (__rq) { + BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; + } + } + + /* + * check for front merge + */ + if (dd->front_merges) { + sector_t rb_key = bio->bi_sector + bio_sectors(bio); + + __rq = as_find_drq_rb(dd, rb_key, bio_data_dir(bio)); + if (__rq) { + BUG_ON(rb_key != rq_rb_key(__rq)); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; + } + } + } + + return ELEVATOR_NO_MERGE; +out: + q->last_merge = &__rq->queuelist; +out_insert: + *insert = &__rq->queuelist; + return ret; +} + +static void as_merged_request(request_queue_t *q, struct request *req) +{ + struct as_data *dd = q->elevator.elevator_data; + struct as_rq *drq = RQ_DATA(req); + + /* + * hash always needs to 
be repositioned, key is end sector + */ + as_del_drq_hash(drq); + as_add_drq_hash(dd, drq); + + /* + * if the merge was a front merge, we need to reposition request + */ + if (rq_rb_key(req) != drq->rb_key) { + as_del_drq_rb(dd, drq); + as_add_drq_rb(dd, drq); + } + + q->last_merge = &req->queuelist; +} + +static void +as_merged_requests(request_queue_t *q, struct request *req, + struct request *next) +{ + struct as_data *dd = q->elevator.elevator_data; + struct as_rq *drq = RQ_DATA(req); + struct as_rq *dnext = RQ_DATA(next); + + BUG_ON(!drq); + BUG_ON(!dnext); + + /* + * reposition drq (this is the merged request) in hash, and in rbtree + * in case of a front merge + */ + as_del_drq_hash(drq); + as_add_drq_hash(dd, drq); + + if (rq_rb_key(req) != drq->rb_key) { + as_del_drq_rb(dd, drq); + as_add_drq_rb(dd, drq); + } + + /* + * if dnext expires before drq, assign its expire time to drq + * and move into dnext position (dnext will be deleted) in fifo + */ + if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) { + if (time_before(dnext->expires, drq->expires)) { + list_move(&drq->fifo, &dnext->fifo); + drq->expires = dnext->expires; + } + } + + /* + * kill knowledge of next, this one is a goner + */ + as_remove_request(q, next); +} + +/* + * move request from sort list to dispatch queue. + */ +static void +as_move_to_dispatch(struct as_data *dd, struct as_rq *drq) +{ + request_queue_t *q = drq->request->q; + + as_remove_request(q, drq->request); + list_add_tail(&drq->request->queuelist, dd->dispatch); +} + + +/* + * move an entry to dispatch queue + */ +static void +as_move_request(struct as_data *dd, struct as_rq *drq) +{ + const int data_dir = rq_data_dir(drq->request); + struct rb_node *rbnext = rb_next(&drq->rb_node); + struct rb_node *rbprev = rb_prev(&drq->rb_node); + struct as_rq *drq_next, *drq_prev; + + BUG_ON(!ON_RB(&drq->rb_node)); + + if (rbprev) + drq_prev = rb_entry_drq(rbprev); + else + drq_prev = NULL; + + if (rbnext) + drq_next = rb_entry_drq(rbnext); + else + drq_next = as_find_first_drq(dd, data_dir); + dd->next_drq[data_dir] = as_choose_req(dd, drq_next, drq_prev); + + dd->last_sector[data_dir] = drq->request->sector + drq->request->nr_sectors; + + if (data_dir == READ) + /* In case we have to anticipate after this */ + dd->current_id = drq->request_id; + + /* + * take it off the sort and fifo list, move + * to dispatch queue + */ + as_move_to_dispatch(dd, drq); +} + +#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) + +/* + * as_fifo_expired returns 0 if there are no expired reads on the fifo, + * 1 otherwise. It is ratelimited so that we only perform the check once per + * `fifo_expire' interval. Otherwise a large number of expired requests + * would create a hopeless seekstorm. + * + * The funny "absolute difference" math on the elapsed time is to handle + * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. + */ +static int as_fifo_expired(struct as_data *dd, int ddir) +{ + struct as_rq *drq; + long delta_jif; + + delta_jif = jiffies - dd->last_check_fifo[ddir]; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < dd->fifo_expire[ddir]) + return 0; + + dd->last_check_fifo[ddir] = jiffies; + + if (list_empty(&dd->fifo_list[ddir])) + return 0; + + drq = list_entry_fifo(dd->fifo_list[ddir].next); + + return time_after(jiffies, drq->expires); +} + +/* + * as_batch_expired returns true if the current batch has expired. 
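+ *
+ * A batch is a run of requests in a single data direction
+ * (dd->batch_data_dir).  current_batch_expires is set to
+ * jiffies + batch_expire[data_dir] each time a new batch is started
+ * in as_dispatch_request, so an expired batch is simply one that has
+ * been running for longer than read_batch_expire/write_batch_expire.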
+ */ +static inline int as_batch_expired(struct as_data *dd) +{ + return time_after(jiffies, dd->current_batch_expires); +} + +/* + * anticipatory scheduling functions follow + */ + +static inline unsigned long request_id(void) +{ + return (unsigned long)current->pid; +} + +static int as_queue_empty(request_queue_t *q); + +/* + * as_anticipate_work is scheduled by as_anticipate_timeout. It + * stops anticipation, ie. resumes dispatching requests to a device. + */ +static void as_anticipate_work(void *data) +{ + struct request_queue *q = data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (!as_queue_empty(q)) + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * as_anticipate_timeout is the timer function set by + * as_start_anticipate. + */ +static void as_anticipate_timeout(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + struct as_data *dd = q->elevator.elevator_data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + + if (dd->antic_status != ANTIC_FOUND) + dd->antic_status = ANTIC_TIMEOUT; + + blk_remove_plug(q); + schedule_work(&dd->antic_work); + ant_stats.timeouts++; + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * as_close_req decides if one request is considered "close" to the + * previous one issued. + * Nick: this needs fixing for HZ = 100 + */ +static int +as_close_req(struct as_data *dd, struct as_rq *drq) +{ + unsigned long delay = jiffies - dd->antic_start; + sector_t last = dd->last_sector[dd->batch_data_dir]; + sector_t next = drq->request->sector; + + sector_t delta; /* acceptable close offset (in sectors) */ + + if (dd->antic_status == ANTIC_OFF || delay <= 2) + delta = 64; + else if (delay <= dd->antic_expire / 2) + delta = 64 << ((delay - 2)*2); + else + return 1; + + return (last <= next) && (next <= last + delta); +} + +#define MAXBACK (512 * 1024) + +static struct as_rq * +as_choose_req(struct as_data *dd, + struct as_rq *drq1, struct as_rq *drq2) +{ + int data_dir; + sector_t last, s1, s2, d1, d2; + const sector_t maxback = MAXBACK; + + if (drq1 == NULL) + return drq2; + if (drq2 == NULL) + return drq1; + + data_dir = rq_data_dir(drq1->request); + last = dd->last_sector[data_dir]; + s1 = drq1->request->sector; + s2 = drq2->request->sector; + + BUG_ON(data_dir != rq_data_dir(drq2->request)); + + /* + * Nick: boggle. What algorithm is this implementing? + */ + if (data_dir == READ) { + if (s1 >= last) + d1 = s1 - last; + else { + /* count large back seeks as a forward seek */ + if (dd->current_id == drq1->request_id && s1+maxback >= last) + d1 = (last - s1)*2; + else + d1 = (last - s1)*8; + } + + if (s2 >= last) + d2 = s2 - last; + else { + if (dd->current_id == drq2->request_id && s2+maxback >= last) + d2 = (last - s2)*2; + else + d2 = (last - s2)*8; + } + } else { /* data_dir == WRITE */ + if (s1 >= last && s2 >= last) { + d1 = s1 - last; + d2 = s2 - last; + } else if (s1 >= last && s2 < last) { + d1 = s1 - last; + d2 = d1+1; + } else if (s2 >= last && s1 < last) { + d2 = s2 - last; + d1 = d2+1; + } else { + d1 = s1; + d2 = s2; + } + } + + if (d1 < d2) + return drq1; + else if (d2 < d1) + return drq2; + else { + if (s1 >= s2) + return drq1; + else + return drq2; + } +} + +/* + * as_antic_req, has @dd been anticipating this @drq? 
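+ *
+ * Returns 1 if @drq is "close" to the last request we dispatched
+ * (see as_close_req) or was issued by the process we are expecting
+ * (its request_id matches dd->current_id), 0 otherwise.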
+ */ +static int +as_antic_req(struct as_data *dd, struct as_rq *drq) +{ + if (as_close_req(dd, drq)) { + ant_stats.close_requests++; + return 1; + } + if (dd->current_id == drq->request_id) { + ant_stats.matching_ids++; + return 1; + } + return 0; +} + +/* + * as_update_drq must be called whenever a request (drq) is added to + * the sort_list. This function keeps caches up to date, and checks if the + * request might be one we are "anticipating" + */ +static void +as_update_drq(struct as_data *dd, struct as_rq *drq) +{ + const int data_dir = rq_data_dir(drq->request); + sector_t last = dd->last_sector[data_dir]; + sector_t this = drq->request->sector; + unsigned long delay = jiffies - dd->antic_start; + + drq->request_id = request_id(); + + if (data_dir == READ) + ant_stats.reads++; + else + ant_stats.writes++; + + /* keep the next_drq cache up to date */ + dd->next_drq[data_dir] = as_choose_req(dd, drq, dd->next_drq[data_dir]); + + /* have we been anticipating this request? */ + if (dd->antic_status != ANTIC_OFF && data_dir == READ + && as_antic_req(dd, drq)) { + long lba_offset; + int neg; + int log2; + + if (delay >= ARRAY_SIZE(ant_stats.ant_delay_hist)) + delay = ARRAY_SIZE(ant_stats.ant_delay_hist) - 1; + ant_stats.ant_delay_hist[delay]++; + ant_stats.anticipate_hits++; + + lba_offset = this - last; + neg = 0; + if (lba_offset < 0) { + lba_offset = -lba_offset; + neg = 1; + } + log2 = ffs(lba_offset); + BUG_ON(log2 >= 32); + if (neg) + ant_stats.lba_backward_offsets[log2]++; + else + ant_stats.lba_forward_offsets[log2]++; + + del_timer(&dd->antic_timer); + dd->antic_status = ANTIC_FOUND; + blk_remove_plug(drq->request->q); + schedule_work(&dd->antic_work); + } +} + +/* + * argh. Nick, can you please comment every clause in this function? + */ +static int +can_start_anticipation(struct as_data *dd, struct as_rq *drq) +{ + if (dd->antic_status == ANTIC_FOUND) + return 0; + if (!(dd->antic_status == ANTIC_OFF || + time_before(jiffies, dd->antic_start + dd->antic_expire))) + return 0; + if (drq && as_antic_req(dd, drq)) + return 0; + return 1; +} + +/* + * as_dispatch_request selects the best request according to + * read/write expire, batch expire, etc, and moves it to the dispatch + * queue. Returns 1 if a request was found, 0 otherwise. + */ +static int as_dispatch_request(struct request_queue *q) +{ + struct as_data *dd = q->elevator.elevator_data; + struct as_rq *drq; + const int reads = !list_empty(&dd->fifo_list[READ]); + const int writes = !list_empty(&dd->fifo_list[WRITE]); + + if (!(reads || writes)) + return 0; + + if (as_batch_expired(dd)) { + if (dd->batch_data_dir == READ) + ant_stats.expired_read_batches++; + else + ant_stats.expired_write_batches++; + } + + if (!(reads && writes && as_batch_expired(dd))) { + /* + * batch is still running or no reads or no writes + */ + drq = dd->next_drq[dd->batch_data_dir]; + + if (dd->batch_data_dir == READ && dd->antic_expire) { + if (as_fifo_expired(dd, READ)) + goto fifo_expired; + + if (can_start_anticipation(dd, drq)) { + unsigned long timeout; + + if (dd->antic_status == ANTIC_OFF) { + ant_stats.anticipate_starts++; + dd->antic_start = jiffies; + } + timeout = dd->antic_start + dd->antic_expire; +#if 0 + /* FIX THIS!!! 
*/ + timeout = min(timeout, + dd->current_batch_expires); +#endif + mod_timer(&dd->antic_timer, timeout); + + dd->antic_status = ANTIC_WAIT; + blk_plug_device(q); + + return 0; + } + + } + + if (drq) { + /* we have a "next request" */ + if (reads && !writes) + dd->current_batch_expires = + jiffies + dd->batch_expire[READ]; + goto dispatch_request; + } + } + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (reads) { + BUG_ON(RB_EMPTY(&dd->sort_list[READ])); + + if (writes && dd->batch_data_dir == READ) + /* + * Last batch was a read, switch to writes + */ + goto dispatch_writes; + + dd->batch_data_dir = READ; + drq = dd->next_drq[dd->batch_data_dir]; + dd->current_batch_expires = jiffies + + dd->batch_expire[dd->batch_data_dir]; + goto dispatch_request; + } + + /* + * there are either no reads or the last batch was a read + */ + + if (writes) { +dispatch_writes: + BUG_ON(RB_EMPTY(&dd->sort_list[WRITE])); + + dd->batch_data_dir = WRITE; + drq = dd->next_drq[dd->batch_data_dir]; + dd->current_batch_expires = jiffies + + dd->batch_expire[dd->batch_data_dir]; + goto dispatch_request; + } + + BUG(); + return 0; + +dispatch_request: + + /* + * If a request has expired, service it. + */ + + if (as_fifo_expired(dd, dd->batch_data_dir)) { +fifo_expired: + if (dd->batch_data_dir == WRITE) + ant_stats.expired_fifo_writes++; + else + ant_stats.expired_fifo_reads++; + drq = list_entry_fifo(dd->fifo_list[dd->batch_data_dir].next); + BUG_ON(drq == NULL); + } + + /* + * drq is the selected appropriate request. + */ + dd->antic_status = ANTIC_OFF; + as_move_request(dd, drq); + return 1; +} + +static struct request *as_next_request(request_queue_t *q) +{ + struct as_data *dd = q->elevator.elevator_data; + struct request *rq = NULL; + + /* + * if there are still requests on the dispatch queue, grab the first one + */ + if (!list_empty(dd->dispatch) || as_dispatch_request(q)) + rq = list_entry_rq(dd->dispatch->next); + return rq; +} + +static void +as_insert_request(request_queue_t *q, struct request *rq, + struct list_head *insert_here) +{ + struct as_data *dd = q->elevator.elevator_data; + struct as_rq *drq = RQ_DATA(rq); + + if (unlikely(rq->flags & REQ_HARDBARRIER)) { + DL_INVALIDATE_HASH(dd); + q->last_merge = NULL; + } + + if (unlikely(!blk_fs_request(rq))) { + if (!insert_here) + insert_here = dd->dispatch->prev; + + list_add(&rq->queuelist, insert_here); + + if (rq_data_dir(rq) == READ && dd->antic_status != ANTIC_OFF) { + del_timer(&dd->antic_timer); + dd->antic_status = ANTIC_FOUND; + blk_remove_plug(q); + schedule_work(&dd->antic_work); + } + + return; + } + + if (rq_mergeable(rq)) { + as_add_drq_hash(dd, drq); + + if (!q->last_merge) + q->last_merge = &rq->queuelist; + } + + as_add_request(dd, drq); +} + +static int as_queue_empty(request_queue_t *q) +{ + struct as_data *dd = q->elevator.elevator_data; + + if (!list_empty(&dd->fifo_list[WRITE]) + || !list_empty(&dd->fifo_list[READ]) + || !list_empty(dd->dispatch) ) + return 0; + + return 1; +} + +/* + * as_queue_notready tells us weather or not as_next_request + * will return us a request or NULL. 
With the previous work conserving + * scheduler this API was designed around, if a queue had requests in it, + * as_next_request would return a request, and drivers seem to make + * that assumption + */ +static int as_queue_notready(request_queue_t *q) +{ + struct as_data *dd = q->elevator.elevator_data; + + if (!list_empty(dd->dispatch)) + return 0; + + if (dd->antic_status == ANTIC_WAIT) + return 1; + + if (!as_dispatch_request(q)) + return 1; + + return 0; +} + +static struct request * +as_former_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *drq = RQ_DATA(rq); + struct rb_node *rbprev = rb_prev(&drq->rb_node); + + if (rbprev) + return rb_entry_drq(rbprev)->request; + + return NULL; +} + +static struct request * +as_latter_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *drq = RQ_DATA(rq); + struct rb_node *rbnext = rb_next(&drq->rb_node); + + if (rbnext) + return rb_entry_drq(rbnext)->request; + + return NULL; +} + +static void as_exit(request_queue_t *q, elevator_t *e) +{ + struct as_data *dd = e->elevator_data; + struct as_rq *drq; + struct request *rq; + int i; + + BUG_ON(!list_empty(&dd->fifo_list[READ])); + BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + + for (i = READ; i <= WRITE; i++) { + struct request_list *rl = &q->rq[i]; + struct list_head *entry; + + list_for_each(entry, &rl->free) { + rq = list_entry_rq(entry); + + if ((drq = RQ_DATA(rq)) == NULL) + continue; + + rq->elevator_private = NULL; + kmem_cache_free(drq_pool, drq); + } + } + + kfree(dd->hash); + kfree(dd); +} + +/* + * initialize elevator private data (as_data), and alloc a drq for + * each request on the free lists + */ +static int as_init(request_queue_t *q, elevator_t *e) +{ + struct as_data *dd; + struct as_rq *drq; + struct request *rq; + int i, ret = 0; + + if (!drq_pool) + return -ENOMEM; + + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + memset(dd, 0, sizeof(*dd)); + + dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL); + if (!dd->hash) { + kfree(dd); + return -ENOMEM; + } + + /* anticipatory scheduling helpers */ + dd->antic_timer.function = as_anticipate_timeout; + dd->antic_timer.data = (unsigned long)q; + init_timer(&dd->antic_timer); + INIT_WORK(&dd->antic_work, as_anticipate_work, q); + + for (i = 0; i < DL_HASH_ENTRIES; i++) + INIT_LIST_HEAD(&dd->hash[i]); + + INIT_LIST_HEAD(&dd->fifo_list[READ]); + INIT_LIST_HEAD(&dd->fifo_list[WRITE]); + dd->sort_list[READ] = RB_ROOT; + dd->sort_list[WRITE] = RB_ROOT; + dd->dispatch = &q->queue_head; + dd->fifo_expire[READ] = read_expire; + dd->fifo_expire[WRITE] = write_expire; + dd->hash_valid_count = 1; + dd->front_merges = 1; + dd->antic_expire = antic_expire; + dd->batch_expire[READ] = read_batch_expire; + dd->batch_expire[WRITE] = write_batch_expire; + e->elevator_data = dd; + + for (i = READ; i <= WRITE; i++) { + struct request_list *rl = &q->rq[i]; + struct list_head *entry; + + list_for_each(entry, &rl->free) { + rq = list_entry_rq(entry); + + drq = kmem_cache_alloc(drq_pool, GFP_KERNEL); + if (!drq) { + ret = -ENOMEM; + break; + } + + memset(drq, 0, sizeof(*drq)); + INIT_LIST_HEAD(&drq->fifo); + INIT_LIST_HEAD(&drq->hash); + RB_CLEAR(&drq->rb_node); + drq->request = rq; + rq->elevator_private = drq; + } + } + + if (ret) + as_exit(q, e); + + return ret; +} + +/* + * sysfs parts below + */ +struct as_fs_entry { + struct attribute attr; + ssize_t (*show)(struct as_data *, char *); + ssize_t (*store)(struct as_data *, const char *, size_t); +}; + +static ssize_t 
+as_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +as_var_store(unsigned long *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtoul(p, &p, 10); + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct as_data *dd, char *page) \ +{ \ + return as_var_show(__VAR, (page)); \ +} +SHOW_FUNCTION(as_readexpire_show, dd->fifo_expire[READ]); +SHOW_FUNCTION(as_writeexpire_show, dd->fifo_expire[WRITE]); +SHOW_FUNCTION(as_frontmerges_show, dd->front_merges); +SHOW_FUNCTION(as_anticexpire_show, dd->antic_expire); +SHOW_FUNCTION(as_read_batchexpire_show, dd->batch_expire[READ]); +SHOW_FUNCTION(as_write_batchexpire_show, dd->batch_expire[WRITE]); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct as_data *dd, const char *page, size_t count) \ +{ \ + int ret = as_var_store(__PTR, (page), count); \ + if (*(__PTR) < (MIN)) \ + *(__PTR) = (MIN); \ + else if (*(__PTR) > (MAX)) \ + *(__PTR) = (MAX); \ + return ret; \ +} +STORE_FUNCTION(as_readexpire_store, &dd->fifo_expire[READ], 0, INT_MAX); +STORE_FUNCTION(as_writeexpire_store, &dd->fifo_expire[WRITE], 0, INT_MAX); +STORE_FUNCTION(as_frontmerges_store, &dd->front_merges, 0, 1); +STORE_FUNCTION(as_anticexpire_store, &dd->antic_expire, 0, INT_MAX); +STORE_FUNCTION(as_read_batchexpire_store, + &dd->batch_expire[READ], 0, INT_MAX); +STORE_FUNCTION(as_write_batchexpire_store, + &dd->batch_expire[WRITE], 0, INT_MAX); +#undef STORE_FUNCTION + +static struct as_fs_entry as_readexpire_entry = { + .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_readexpire_show, + .store = as_readexpire_store, +}; +static struct as_fs_entry as_writeexpire_entry = { + .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_writeexpire_show, + .store = as_writeexpire_store, +}; +static struct as_fs_entry as_frontmerges_entry = { + .attr = {.name = "front_merges", .mode = S_IRUGO | S_IWUSR }, + .show = as_frontmerges_show, + .store = as_frontmerges_store, +}; +static struct as_fs_entry as_anticexpire_entry = { + .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_anticexpire_show, + .store = as_anticexpire_store, +}; +static struct as_fs_entry as_read_batchexpire_entry = { + .attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_read_batchexpire_show, + .store = as_read_batchexpire_store, +}; +static struct as_fs_entry as_write_batchexpire_entry = { + .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_write_batchexpire_show, + .store = as_write_batchexpire_store, +}; + +static struct attribute *default_attrs[] = { + &as_readexpire_entry.attr, + &as_writeexpire_entry.attr, + &as_frontmerges_entry.attr, + &as_anticexpire_entry.attr, + &as_read_batchexpire_entry.attr, + &as_write_batchexpire_entry.attr, + NULL, +}; + +#define to_as(atr) container_of((atr), struct as_fs_entry, attr) + +static ssize_t +as_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct as_fs_entry *entry = to_as(attr); + + if (!entry->show) + return 0; + + return entry->show(e->elevator_data, page); +} + +static ssize_t +as_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct as_fs_entry *entry = to_as(attr); + + if (!entry->store) + 
return -EINVAL;
+
+	return entry->store(e->elevator_data, page, length);
+}
+
+static struct sysfs_ops as_sysfs_ops = {
+	.show = as_attr_show,
+	.store = as_attr_store,
+};
+
+struct kobj_type as_ktype = {
+	.sysfs_ops = &as_sysfs_ops,
+	.default_attrs = default_attrs,
+};
+
+static int __init as_slab_setup(void)
+{
+	drq_pool = kmem_cache_create("as_drq", sizeof(struct as_rq),
+				     0, 0, NULL, NULL);
+
+	if (!drq_pool)
+		panic("as: can't init slab pool\n");
+
+	return 0;
+}
+
+subsys_initcall(as_slab_setup);
+
+elevator_t iosched_as = {
+	.elevator_merge_fn = as_merge,
+	.elevator_merged_fn = as_merged_request,
+	.elevator_merge_req_fn = as_merged_requests,
+	.elevator_next_req_fn = as_next_request,
+	.elevator_add_req_fn = as_insert_request,
+	.elevator_remove_req_fn = as_remove_request,
+	.elevator_queue_empty_fn = as_queue_notready,
+	.elevator_former_req_fn = as_former_request,
+	.elevator_latter_req_fn = as_latter_request,
+	.elevator_init_fn = as_init,
+	.elevator_exit_fn = as_exit,
+
+	.elevator_ktype = &as_ktype,
+};
+
+EXPORT_SYMBOL(iosched_as);
diff -puN drivers/block/Makefile~as-iosched drivers/block/Makefile
--- 25/drivers/block/Makefile~as-iosched	2003-02-20 23:20:19.000000000 -0800
+++ 25-akpm/drivers/block/Makefile	2003-02-20 23:20:30.000000000 -0800
@@ -8,7 +8,8 @@
 # In the future, some of these should be built conditionally.
 #
-obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o deadline-iosched.o
+obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o \
+	deadline-iosched.o as-iosched.o
 obj-$(CONFIG_MAC_FLOPPY) += swim3.o
 obj-$(CONFIG_BLK_DEV_FD) += floppy.o
diff -puN include/linux/elevator.h~as-iosched include/linux/elevator.h
--- 25/include/linux/elevator.h~as-iosched	2003-02-20 23:20:19.000000000 -0800
+++ 25-akpm/include/linux/elevator.h	2003-02-20 23:20:30.000000000 -0800
@@ -73,6 +73,11 @@ extern elevator_t elevator_noop;
  */
 extern elevator_t iosched_deadline;
+/*
+ * anticipatory I/O scheduler
+ */
+extern elevator_t iosched_as;
+
 extern int elevator_init(request_queue_t *, elevator_t *);
 extern void elevator_exit(request_queue_t *);
 extern inline int bio_rq_in_between(struct bio *, struct request *, struct list_head *);
diff -puN drivers/block/ll_rw_blk.c~as-iosched drivers/block/ll_rw_blk.c
--- 25/drivers/block/ll_rw_blk.c~as-iosched	2003-02-20 23:20:19.000000000 -0800
+++ 25-akpm/drivers/block/ll_rw_blk.c	2003-02-20 23:20:30.000000000 -0800
@@ -1212,6 +1212,18 @@ nomem:
 static int __make_request(request_queue_t *, struct bio *);
+static elevator_t *chosen_elevator = &iosched_as;
+
+static int __init elevator_setup(char *str)
+{
+	if (!strcmp(str, "deadline")) {
+		chosen_elevator = &iosched_deadline;
+		printk("elevator: deadline\n");
+	}
+	return 1;
+}
+__setup("elevator=", elevator_setup);
+
 /**
  * blk_init_queue - prepare a request queue for use with a block device
  * @q: The &request_queue_t to be initialised
@@ -1247,7 +1259,12 @@ int blk_init_queue(request_queue_t *q, r
 	if (blk_init_free_list(q))
 		return -ENOMEM;
-	if ((ret = elevator_init(q, &iosched_deadline))) {
+	if (chosen_elevator == &iosched_deadline)
+		printk("deadline elevator\n");
+	else if (chosen_elevator == &iosched_as)
+		printk("anticipatory scheduling elevator\n");
+
+	if ((ret = elevator_init(q, chosen_elevator))) {
		blk_cleanup_queue(q);
		return ret;
	}
_
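
On the "Nick! These need descriptions" note near the top of as-iosched.c: the following is one reviewer's reading of the four ANTIC_* states, restated as a commented enum purely for description (the patch itself keeps the plain #defines); it is derived from as_dispatch_request(), as_update_drq(), as_insert_request() and as_anticipate_timeout().

/*
 * Reviewer's reading of the anticipation states (descriptive only,
 * not part of the patch):
 */
enum as_antic_status {
	ANTIC_OFF,	/* not anticipating; dispatch requests normally */
	ANTIC_WAIT,	/* antic_timer armed and the queue plugged;
			 * dispatch is held back while we wait for a
			 * "close" read from the expected process */
	ANTIC_TIMEOUT,	/* antic_expire passed with no such read;
			 * as_anticipate_work() restarts dispatching */
	ANTIC_FOUND,	/* the anticipated read (or a non-fs READ)
			 * arrived; the timer is deleted and dispatch
			 * resumes immediately */
};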
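
And on the "boggle, what algorithm is this implementing?" question in as_choose_req(): for READs it is a weighted seek-distance comparison, roughly "prefer the closest forward request; tolerate a short backward seek (within MAXBACK) only from the process we are anticipating, at twice the distance; penalise any other backward seek eight-fold". A minimal stand-alone user-space sketch of that arithmetic follows, with hypothetical sector numbers (read_distance(), main() and the sample values are illustration only, not code from the patch):

#include <stdio.h>

typedef unsigned long long sector_t;

#define MAXBACK (512 * 1024)		/* same constant as the patch */

/*
 * Effective "distance" from the last dispatched READ sector to a
 * candidate request, mirroring the READ branch of as_choose_req().
 * same_id stands for dd->current_id == drq->request_id.
 */
static sector_t read_distance(sector_t last, sector_t s, int same_id)
{
	if (s >= last)
		return s - last;		/* forward seek: face value */
	if (same_id && s + MAXBACK >= last)
		return (last - s) * 2;		/* short backward seek, same process */
	return (last - s) * 8;			/* any other backward seek */
}

int main(void)
{
	sector_t last = 1000000;		/* hypothetical last READ sector */
	sector_t d1 = read_distance(last, last + 4096, 0);	/* 4096 ahead, other process */
	sector_t d2 = read_distance(last, last - 1024, 1);	/* 1024 behind, anticipated process */

	/* the smaller weighted distance wins, as in as_choose_req() */
	printf("d1=%llu d2=%llu -> prefer %s\n", d1, d2,
	       d1 <= d2 ? "the forward request" : "the backward request");
	return 0;
}

The WRITE branch, by contrast, is a plain one-way elevator: requests ahead of last_sector are taken in ascending order, and the scan only wraps back to the lowest sector once nothing lies ahead.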