From: Nick Piggin

It seems to go alright in the basic tests. It really helps "1 reader and
1 writer" on my SCSI drive, but I'd like you to do your run of TCQ tests
on it if you could. Quite a few big changes were needed to get SCSI TCQ
working properly. I'll give you a short summary:

* Measure write batches by # of requests. Time has little meaning here,
  as we can fill a lot of disk cache in a small amount of time. (This
  option should be moved back to a time-based tunable which adjusts
  itself based on how long the last run took. At the moment it probably
  won't write out enough on small-request-size devices.) There's a small
  sketch of this accounting after the patch.

* When changing batch direction, don't submit any requests in the new
  direction until all outstanding requests have been completed. (See the
  changed_batch sketch after the patch.)

* When starting a new batch direction, make the first request a hard
  barrier (I don't know whether this helps).

* Don't start timing a read (actually "SYNC") batch until its first
  request has been completed. This should help with writeback caches.

It also includes some fixes / misc stuff done while the above was being
done:

* Clean up / move enums around.

* Now that we have sync vs async, we shouldn't need to keep track of
  "data dir" as well. Get rid of it.

* Thinktime is only accounted when the process has no outstanding
  requests. This seems pretty sane when you think about it (also
  sketched after the patch).

* Limit the rate at which seek_distance can grow. A generally
  nicely-seeking process shouldn't suffer _too_ much if it has to seek
  to a fragment, or has to page in some code/library/swap, etc. (The
  clamp is sketched after the patch too.)

* Properly set arq->state = AS_RQ_NEW. Now that arqs are dynamically
  initialised, the previous place this was done was more useless than
  ever.

* Allow REQ_HARDBARRIER requests to be dispatched normally. No reason
  why they shouldn't be.

 drivers/block/as-iosched.c |  175 ++++++++++++++++++++++++---------------------
 1 files changed, 96 insertions(+), 79 deletions(-)

diff -puN drivers/block/as-iosched.c~as-jumbo-patch-for-scsi drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-jumbo-patch-for-scsi	2003-05-28 03:10:51.000000000 -0700
+++ 25-akpm/drivers/block/as-iosched.c	2003-05-28 03:10:51.000000000 -0700
@@ -47,10 +47,10 @@ static unsigned long write_expire = HZ /
 static unsigned long read_batch_expire = HZ / 5;
 
 /*
- * write_batch_expire describes how long we will allow a stream of writes to
- * persist before looking to see whether it is time to switch over to reads.
+ * write_batch_expire describes how many write requests we allow before looking
+ * to see whether it is time to switch over to reads.
 */
-static unsigned long write_batch_expire = HZ / 20;
+static unsigned long write_batch_expire = 5;
 
 /*
  * max time we may wait to anticipate a read (default around 6ms)
 */
@@ -74,6 +74,12 @@ static unsigned long antic_expire = ((HZ
 */
 #define MAX_THINKTIME (HZ/50UL)
 
+/* Bits in as_io_context.state */
+enum as_io_states {
+	AS_TASK_RUNNING=0,	/* Process has not exitted */
+	AS_TASK_IORUNNING,	/* Process has completed some IO */
+};
+
 struct as_io_context {
 	atomic_t refcount;
 	pid_t pid;
@@ -87,10 +93,6 @@ struct as_io_context {
 	unsigned long ttime_total;
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
-	/* Read / write pattern */
-	int last_data_dir;
-	unsigned long dir_after_read[2];
-	int mean_dir_after_read;
 	/* Layout pattern */
 	long seek_samples;
 	sector_t last_request_pos;
@@ -98,10 +100,13 @@ struct as_io_context {
 	sector_t seek_mean;
 };
 
-/* Bits in as_io_context.state */
-enum as_io_states {
-	AS_TASK_RUNNING=0,	/* Process has not exitted */
-	AS_TASK_IORUNNING,	/* Process has completed some IO */
+enum anticipation_status {
+	ANTIC_OFF=0,		/* Not anticipating (normal operation) */
+	ANTIC_WAIT_REQ,		/* The last read has not yet completed */
+	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
+				   last read (which has completed) */
+	ANTIC_FINISHED,		/* Anticipating but have found a candidate
+				 * or timed out */
 };
 
 struct as_data {
@@ -118,21 +123,23 @@ struct as_data {
 	struct list_head fifo_list[2];
 
 	struct as_rq *next_arq[2];	/* next in sort order */
-	sector_t last_sector[2];	/* last REQ_SYNC and REQ_ASYNC sectors */
+	sector_t last_sector[2];	/* last REQ_SYNC & REQ_ASYNC sectors */
 	struct list_head *dispatch;	/* driver dispatch queue */
 	struct list_head *hash;		/* request hash */
 	unsigned long hash_valid_count;	/* barrier hash count */
 	unsigned long current_batch_expires;
 	unsigned long last_check_fifo[2];
-	int batch_data_dir;	/* current/last batch REQ_SYNC or REQ_ASYNC */
+	int changed_batch;
+	int batch_data_dir;	/* current batch REQ_SYNC / REQ_ASYNC */
 
 	mempool_t *arq_pool;
 
-	int antic_status;
+	enum anticipation_status antic_status;
 	unsigned long antic_start;	/* jiffies: when it started */
 	struct timer_list antic_timer;	/* anticipatory scheduling timer */
 	struct work_struct antic_work;	/* Deferred unplugging */
 	struct as_io_context *as_io_context;/* Identify the expected process */
 	int aic_finished; /* IO associated with as_io_context finished */
+	int nr_dispatched;
 
 	/*
 	 * settings that change how the i/o scheduler behaves
@@ -144,15 +151,6 @@ struct as_data {
 
 #define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)
 
-enum anticipation_states {
-	ANTIC_OFF=0,		/* Not anticipating (normal operation) */
-	ANTIC_WAIT_REQ,		/* The last read has not yet completed */
-	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
-				   last read (which has completed) */
-	ANTIC_FINISHED,		/* Anticipating but have found a candidate
-				 * or timed out */
-};
-
 /*
  * per-request data.
 */
@@ -260,9 +258,6 @@ static struct as_io_context *get_as_io_c
 		ret->ttime_total = 0;
 		ret->ttime_samples = 0;
 		ret->ttime_mean = 0;
-		ret->dir_after_read[REQ_SYNC] = 0;
-		ret->dir_after_read[REQ_ASYNC] = 0;
-		ret->mean_dir_after_read = REQ_SYNC;
 		ret->seek_total = 0;
 		ret->seek_samples = 0;
 		ret->seek_mean = 0;
@@ -750,13 +745,12 @@ static int as_can_break_anticipation(str
 	}
 
 	aic = ad->as_io_context;
+	BUG_ON(!aic);
+
 	if (arq && aic == arq->as_io_context) {
 		/* request from same process */
 		return 1;
 	}
-
-	if (!aic)
-		return 0;
 
 	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
 		/* process anticipated on has exitted */
@@ -778,20 +772,16 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (aic->mean_dir_after_read != REQ_SYNC) {
-		/* next request from this process will probably be a write */
-		return 1;
-	}
-
 	if (arq && aic->seek_samples) {
 		sector_t s;
 		if (ad->last_sector[REQ_SYNC] < arq->request->sector)
 			s = arq->request->sector - ad->last_sector[REQ_SYNC];
 		else
 			s = ad->last_sector[REQ_SYNC] - arq->request->sector;
-		if (aic->seek_mean > s)
+		if (aic->seek_mean > s) {
 			/* this request is better than what we're expecting */
 			return 1;
+		}
 	}
 
 	return 0;
@@ -849,7 +839,9 @@ static void as_update_iohist(struct as_i
 		return;
 
 	if (data_dir == REQ_SYNC) {
-		if (test_bit(AS_TASK_IORUNNING, &aic->state)) {
+		if (test_bit(AS_TASK_IORUNNING, &aic->state)
+				&& !atomic_read(&aic->nr_queued)
+				&& !atomic_read(&aic->nr_dispatched)) {
 			/* Calculate read -> read thinktime */
 			thinktime = jiffies - aic->last_end_request;
 			thinktime = min(thinktime, MAX_THINKTIME-1);
@@ -873,8 +865,16 @@ static void as_update_iohist(struct as_i
 			seek_dist = aic->last_request_pos - rq->sector;
 		aic->last_request_pos = rq->sector + rq->nr_sectors;
 
+		/*
+		 * Don't allow the seek distance to get too large from the
+		 * odd fragment, pagein, etc
+		 */
 		if (!aic->seek_samples)
 			seek_dist = 0;
+		else if (aic->seek_samples < 400) /* second&third seek */
+			seek_dist = min(seek_dist, (aic->seek_mean * 4) + 2*1024*1024);
+		else
+			seek_dist = min(seek_dist, (aic->seek_mean * 4) + 2*1024*64);
 
 		aic->seek_samples += 256;
 		aic->seek_total += 256*seek_dist;
@@ -887,23 +887,6 @@ static void as_update_iohist(struct as_i
 			aic->seek_total = (aic->seek_total>>1)
 						+ (aic->seek_total>>2);
 		}
-
-		/* Calculate read/write pattern */
-		if (aic->last_data_dir == REQ_SYNC) {
-			unsigned long rprob, wprob;
-			aic->dir_after_read[data_dir] += 256;
-			rprob = aic->dir_after_read[REQ_SYNC];
-			wprob = aic->dir_after_read[REQ_ASYNC];
-
-			if (rprob*4 >= wprob*5)
-				aic->mean_dir_after_read = REQ_SYNC;
-			else
-				aic->mean_dir_after_read = REQ_ASYNC;
-
-			aic->dir_after_read[REQ_SYNC] = (rprob>>1) + (rprob>>2);
-			aic->dir_after_read[REQ_ASYNC] = (wprob>>1) + (wprob>>2);
-		}
-		aic->last_data_dir = data_dir;
 	}
 
 /*
@@ -940,13 +923,35 @@ static void as_completed_request(request
 	struct as_rq *arq = RQ_DATA(rq);
 	struct as_io_context *aic = arq->as_io_context;
 
-	arq->state = AS_RQ_NEW;
-
-	if (unlikely(!blk_fs_request(rq) || rq->flags & REQ_HARDBARRIER)) {
+	if (unlikely(!blk_fs_request(rq))) {
 		WARN_ON(aic);
 		return;
 	}
 
+	if (blk_fs_request(rq) && arq->state == AS_RQ_NEW)
+		printk(KERN_INFO "warning: as_completed_request got bad request\n");
+
+	if (arq->state != AS_RQ_DISPATCHED)
+		return;
+
+	if (ad->changed_batch && ad->nr_dispatched == 1) {
+		kblockd_schedule_work(&ad->antic_work);
+		ad->changed_batch = 2;
+	}
+	ad->nr_dispatched--;
+
+	/*
+	 * Start counting the batch from when a request of that direction is
+	 * actually serviced. This should help devices with big TCQ windows
+	 * and writeback caches
+	 */
+	if (ad->batch_data_dir == REQ_SYNC && ad->changed_batch
+			&& ad->batch_data_dir == arq->is_sync) {
+		ad->current_batch_expires = jiffies +
+			ad->batch_expire[REQ_SYNC];
+		ad->changed_batch = 0;
+	}
+
 	if (!aic)
 		return;
@@ -1036,7 +1041,7 @@ static void as_remove_request(request_qu
 {
 	struct as_rq *arq = RQ_DATA(rq);
 
-	if (unlikely(!blk_fs_request(rq) || rq->flags & REQ_HARDBARRIER))
		return;
 
 	if (arq) {
@@ -1082,9 +1087,14 @@ static int as_fifo_expired(struct as_dat
  */
 static inline int as_batch_expired(struct as_data *ad)
 {
-	return time_after(jiffies, ad->current_batch_expires) &&
-		(ad->batch_data_dir == REQ_ASYNC ||
-		time_after(jiffies, ad->fifo_expire[REQ_SYNC]));
+	if (ad->changed_batch)
+		return 0;
+
+	if (ad->batch_data_dir == REQ_SYNC)
+		return time_after(jiffies, ad->current_batch_expires) &&
+			time_after(jiffies, ad->fifo_expire[REQ_SYNC]);
+
+	return !ad->current_batch_expires;
 }
 
 /*
@@ -1106,11 +1116,16 @@ static void as_move_to_dispatch(struct a
 	ad->last_sector[data_dir] = arq->request->sector
 					+ arq->request->nr_sectors;
 
+	ad->nr_dispatched++;
+
 	if (data_dir == REQ_SYNC) {
 		/* In case we have to anticipate after this */
 		copy_as_io_context(&ad->as_io_context, &arq->as_io_context);
-	} else
+	} else {
 		put_as_io_context(&ad->as_io_context);
+		if (ad->current_batch_expires)
+			ad->current_batch_expires--;
+	}
 
 	ad->aic_finished = 0;
 	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
@@ -1138,7 +1153,10 @@ static int as_dispatch_request(struct as
 	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
 	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
 
-	if (!(reads || writes))
+	if (!(reads || writes)
+		|| ad->antic_status == ANTIC_WAIT_REQ
+		|| ad->antic_status == ANTIC_WAIT_NEXT
+		|| ad->changed_batch == 1)
 		return 0;
 
 	if (!(reads && writes && as_batch_expired(ad)) ) {
@@ -1180,10 +1198,10 @@ static int as_dispatch_request(struct as
 			 */
 			goto dispatch_writes;
 
+		if (ad->batch_data_dir == REQ_ASYNC)
+			ad->changed_batch = 1;
 		ad->batch_data_dir = REQ_SYNC;
 		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
-		ad->current_batch_expires = jiffies +
-			ad->batch_expire[ad->batch_data_dir];
 		ad->last_check_fifo[ad->batch_data_dir] = jiffies;
 		goto dispatch_request;
 	}
@@ -1197,9 +1215,10 @@ dispatch_writes:
 		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));
 
-		ad->batch_data_dir = REQ_ASYNC;
+		if (ad->batch_data_dir == REQ_SYNC)
+			ad->changed_batch = 1;
+		ad->batch_data_dir = REQ_ASYNC;
+		ad->current_batch_expires = ad->batch_expire[REQ_ASYNC];
 		arq = ad->next_arq[ad->batch_data_dir];
-		ad->current_batch_expires = jiffies +
-			ad->batch_expire[ad->batch_data_dir];
 		goto dispatch_request;
 	}
@@ -1217,6 +1236,16 @@ fifo_expired:
 		BUG_ON(arq == NULL);
 	}
 
+	if (ad->changed_batch) {
+		if (ad->changed_batch == 1 && ad->nr_dispatched)
+			return 0;
+		if (ad->changed_batch == 2 && ad->batch_data_dir == REQ_ASYNC)
+			ad->changed_batch = 0;
+		else
+			ad->changed_batch = 2;
+		arq->request->flags |= REQ_HARDBARRIER;
+	}
+
 	/*
	 * arq is the selected appropriate request.
 	 */
@@ -1287,16 +1316,6 @@ as_insert_request(request_queue_t *q, st
 		while (ad->next_arq[REQ_ASYNC])
 			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
-
-		list_add_tail(&rq->queuelist, ad->dispatch);
-
-		/* Stop anticipating - let this request get through */
-		if (!list_empty(ad->dispatch)
-				&& (ad->antic_status == ANTIC_WAIT_REQ
-				|| ad->antic_status == ANTIC_WAIT_NEXT))
-			as_antic_stop(ad);
-
-		return;
 	}
 
 	if (unlikely(!blk_fs_request(rq))) {
@@ -1534,13 +1553,11 @@ static int as_set_request(request_queue_
 	if (arq) {
 		RB_CLEAR(&arq->rb_node);
 		arq->request = rq;
-
+		arq->state = AS_RQ_NEW;
 		arq->as_io_context = NULL;
 		INIT_LIST_HEAD(&arq->hash);
 		arq->hash_valid_count = 0;
-
 		INIT_LIST_HEAD(&arq->fifo);
-
 		rq->elevator_private = arq;
 		return 0;
 	}
_
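
P.S. A few userspace toys follow, in case they help with review. None of
this is kernel code: the structs and harnesses are made up, only the
logic mirrors the patch. First, the write batch accounting from the
first bullet; current_batch_expires is loaded with a request count
(write_batch_expire, default 5) instead of a jiffies deadline:

#include <stdio.h>

struct batch {
	unsigned long current_batch_expires;	/* write requests remaining */
	int changed_batch;			/* nonzero while switching direction */
};

static void start_write_batch(struct batch *b, unsigned long write_batch_expire)
{
	b->current_batch_expires = write_batch_expire;
}

/* one decrement per dispatched write, as in as_move_to_dispatch() */
static void dispatch_write(struct batch *b)
{
	if (b->current_batch_expires)
		b->current_batch_expires--;
}

static int write_batch_expired(const struct batch *b)
{
	if (b->changed_batch)		/* still settling a direction change */
		return 0;
	return !b->current_batch_expires;
}

int main(void)
{
	struct batch b = { 0, 0 };
	int i;

	start_write_batch(&b, 5);
	for (i = 1; i <= 6; i++) {
		dispatch_write(&b);
		printf("write %d: expired=%d\n", i, write_batch_expired(&b));
	}
	return 0;
}

It should print expired=0 for the first four writes and expired=1 from
the fifth on.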
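Next, the direction-change drain (second and third bullets). changed_batch
is effectively a three-state flag; this is my reading of it, condensed,
with the driver and queue faked:

#include <stdio.h>

enum { STEADY = 0, DRAINING = 1, BARRIER_SENT = 2 };

struct sched {
	int changed_batch;	/* one of the states above */
	int nr_dispatched;	/* sent to the driver, not yet completed */
	int batch_is_sync;	/* current batch direction */
};

/* Flip direction: nothing in the new direction may go out yet. */
static void change_direction(struct sched *s, int to_sync)
{
	s->batch_is_sync = to_sync;
	s->changed_batch = DRAINING;
}

/* Returns 1 if a request was dispatched, 0 if we must keep draining. */
static int try_dispatch(struct sched *s)
{
	if (s->changed_batch == DRAINING && s->nr_dispatched)
		return 0;	/* old direction still in flight */

	if (s->changed_batch == DRAINING) {
		/* first request of the new batch goes out as a barrier */
		s->changed_batch = BARRIER_SENT;
		printf("  -> tagged REQ_HARDBARRIER\n");
	} else if (s->changed_batch == BARRIER_SENT && !s->batch_is_sync) {
		s->changed_batch = STEADY;	/* write batches settle here */
	}
	s->nr_dispatched++;
	return 1;
}

static void complete_request(struct sched *s, unsigned long now)
{
	s->nr_dispatched--;
	/* a read batch only starts its clock when its first request
	 * completes; that is also where changed_batch drops back to 0 */
	if (s->changed_batch == BARRIER_SENT && s->batch_is_sync) {
		s->changed_batch = STEADY;
		printf("  -> read batch clock starts at %lu\n", now);
	}
}

int main(void)
{
	struct sched s = { STEADY, 2, 0 };	/* two writes in flight */

	change_direction(&s, 1);		/* writes -> reads */
	printf("dispatch during drain: %d\n", try_dispatch(&s));
	complete_request(&s, 100);		/* writes finish */
	complete_request(&s, 101);
	printf("dispatch after drain: %d\n", try_dispatch(&s));
	complete_request(&s, 105);		/* first read completes */
	return 0;
}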
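The thinktime change is just an extra gate in front of the existing
fixed-point mean. Roughly this, with userspace stand-ins for jiffies and
MAX_THINKTIME:

#include <stdio.h>

#define MAX_THINKTIME 20UL	/* stands in for HZ/50 */

struct aic {
	int nr_queued, nr_dispatched;	/* the process's outstanding IO */
	unsigned long last_end_request;
	unsigned long ttime_samples, ttime_total, ttime_mean;
};

/*
 * Only sample read->read thinktime when the process had nothing
 * outstanding: if it still had requests queued or dispatched, the gap
 * since the last completion measures device service time, not how long
 * the process "thought" before issuing more IO.
 */
static void update_thinktime(struct aic *aic, unsigned long now)
{
	unsigned long thinktime;

	if (aic->nr_queued || aic->nr_dispatched)
		return;

	thinktime = now - aic->last_end_request;
	if (thinktime > MAX_THINKTIME - 1)
		thinktime = MAX_THINKTIME - 1;

	/* same style of fixed-point running mean the scheduler keeps */
	aic->ttime_samples += 256;
	aic->ttime_total += 256 * thinktime;
	aic->ttime_mean = aic->ttime_total / aic->ttime_samples;
}

int main(void)
{
	struct aic a = { 0, 0, 100, 0, 0, 0 };

	update_thinktime(&a, 104);	/* idle process: sampled */
	printf("mean=%lu\n", a.ttime_mean);

	a.nr_dispatched = 1;
	update_thinktime(&a, 200);	/* IO in flight: ignored */
	printf("mean=%lu\n", a.ttime_mean);
	return 0;
}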
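And the seek_distance clamp, with the constants straight from the patch
(distances are in sectors; the mean/total bookkeeping below only
approximates what as_update_iohist() does). With a mean established, one
huge outlier gets capped at 4x the mean plus a slack that drops by 16x
once there is some history, instead of wrecking the average:

#include <stdio.h>

typedef unsigned long long sector_t;

struct aic {
	long seek_samples;
	sector_t seek_total, seek_mean;
};

static sector_t min_sect(sector_t a, sector_t b)
{
	return a < b ? a : b;
}

static void account_seek(struct aic *aic, sector_t seek_dist)
{
	if (!aic->seek_samples)
		seek_dist = 0;
	else if (aic->seek_samples < 400)	/* second & third seek */
		seek_dist = min_sect(seek_dist, (aic->seek_mean * 4) + 2*1024*1024);
	else
		seek_dist = min_sect(seek_dist, (aic->seek_mean * 4) + 2*1024*64);

	aic->seek_samples += 256;
	aic->seek_total += 256 * seek_dist;
	aic->seek_mean = (aic->seek_total + aic->seek_samples / 2)
				/ aic->seek_samples;	/* rounded */
}

int main(void)
{
	/* a process that has been seeking ~8 sectors at a time */
	struct aic a = { 512, 512 * 8, 8 };

	account_seek(&a, 1000000);	/* one pagein-sized outlier */
	printf("mean after outlier: %llu sectors\n", a.seek_mean);
	return 0;
}

The outlier is clamped to 8*4 + 2*1024*64 sectors, so the mean lands
around 43k sectors rather than being dragged most of the way to a
million.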