From: Nick Piggin

This one gathers better statistics about the new process problem.  It
improves the estimation of initial process IO: that is, the calculation
of whether it will be worth waiting after a process submits its first
read.

This is done with a per-queue average thinktime and seek distance for
the second read submitted by a process.  When combined with 3/3, numbers
are around the same as mm1 for most long-lived tasks, but much better
for things like the first four benchmarks in the table below.

Probably wants rwhron and the OSDL database guys to give it some testing.

test                                   2.6.0-test9-mm1   2.6.0-test9-mm1-np

Cat kernel source during seq read      0:26.89           0:24.75
Cat kernel source during seq write     9:17.80           0:23.48
ls -lr kernel source during seq read   0:11.03           0:14.68
ls -lr kernel source during seq write  0:49.95           0:08.06
contest no_load                        143s, 0 loads     144s, 0 loads
contest io_load                        193s, 40.2 loads  193s, 40.1 loads
contest read_load                      186s, 11.6 loads  190s, 10.5 loads
contest list_load                      201s, 5.0 loads   200s, 5.0 loads
pgbench 1 client                       31.3 TPS          31.5 TPS
pgbench 4 clients                      37.7 TPS          37.5 TPS
pgbench 16 clients                     42.1 TPS          48.1 TPS
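The estimator arithmetic in the patch is terse, so here is a minimal
standalone sketch of it for anyone who wants to play with the numbers
outside the kernel.  This is illustrative C, not kernel code: struct
estimator, estimator_add() and the demo values are made-up names, but
the update rules mirror as_update_thinktime()/as_update_seekdist() and
the exit_prob updates in the patch below: decaying averages in
1.0 == 1<<8 fixed point, where each new observation replaces one eighth
of the accumulated history.

	/*
	 * Standalone sketch of the decaying-average arithmetic used by
	 * the patch (illustrative names; not kernel code).
	 * Fixed point: 1.0 == 1 << 8.
	 */
	#include <stdio.h>

	struct estimator {
		unsigned long samples;	/* decayed sample count, fixed point */
		unsigned long total;	/* decayed sum of observations */
		unsigned long mean;	/* total/samples; fixed point cancels */
	};

	/* One update, as in as_update_thinktime(): new = (7*old + sample)/8 */
	static void estimator_add(struct estimator *e, unsigned long value)
	{
		e->samples = (7 * e->samples + 256) / 8;
		e->total   = (7 * e->total + 256 * value) / 8;
		e->mean    = (e->total + e->samples / 2) / e->samples;
	}

	int main(void)
	{
		struct estimator think = { 0, 0, 0 };
		unsigned long exit_prob = 0;		/* 256 == 100% */
		unsigned long ttime[] = { 2, 2, 40, 2, 2 };	/* jiffies */
		unsigned int i;

		for (i = 0; i < sizeof(ttime)/sizeof(ttime[0]); i++) {
			estimator_add(&think, ttime[i]);
			/* a second read arrived: decay the exit probability */
			exit_prob = (7 * exit_prob) / 8;
			printf("mean thinktime %lu, exit prob %lu%%\n",
			       think.mean, 100 * exit_prob / 256);
		}
		/* an anticipation timeout (or task exit) would instead do: */
		exit_prob = (7 * exit_prob + 256) / 8;
		printf("after a timeout: exit prob %lu%%\n",
		       100 * exit_prob / 256);
		return 0;
	}

At steady state the decayed sample count converges to 256 (1.0 in fixed
point), which is why the kernel can round the mean with a constant +128;
either way, roughly the last eight observations dominate the estimate,
so the single 40-jiffy outlier above is mostly forgotten within a
handful of further samples.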
 25-akpm/drivers/block/as-iosched.c |  209 +++++++++++++++++++++++--------------
 1 files changed, 131 insertions(+), 78 deletions(-)

diff -puN drivers/block/as-iosched.c~as-new-process-estimation drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-new-process-estimation	Fri Nov  7 15:25:38 2003
+++ 25-akpm/drivers/block/as-iosched.c	Fri Nov  7 15:25:38 2003
@@ -70,6 +70,7 @@
 /* Bits in as_io_context.state */
 enum as_io_states {
 	AS_TASK_RUNNING=0,	/* Process has not exitted */
+	AS_TASK_IOSTARTED,	/* Process has started some IO */
 	AS_TASK_IORUNNING,	/* Process has completed some IO */
 };
 
@@ -99,6 +100,14 @@ struct as_data {
 	sector_t last_sector[2];	/* last REQ_SYNC & REQ_ASYNC sectors */
 	struct list_head *dispatch;	/* driver dispatch queue */
 	struct list_head *hash;		/* request hash */
+
+	unsigned long exit_prob;	/* probability a task will exit while
+					   being waited on */
+	unsigned long new_ttime_total;	/* mean thinktime on new proc */
+	unsigned long new_ttime_mean;
+	u64 new_seek_total;		/* mean seek on new proc */
+	sector_t new_seek_mean;
+
 	unsigned long current_batch_expires;
 	unsigned long last_check_fifo[2];
 	int changed_batch;		/* 1: waiting for old batch to end */
@@ -186,6 +195,7 @@ static void free_as_io_context(struct as
 /* Called when the task exits */
 static void exit_as_io_context(struct as_io_context *aic)
 {
+	WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
 	clear_bit(AS_TASK_RUNNING, &aic->state);
 }
 
@@ -608,8 +618,15 @@ static void as_antic_timeout(unsigned lo
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (ad->antic_status == ANTIC_WAIT_REQ
 			|| ad->antic_status == ANTIC_WAIT_NEXT) {
+		struct as_io_context *aic = ad->io_context->aic;
+
 		ad->antic_status = ANTIC_FINISHED;
 		kblockd_schedule_work(&ad->antic_work);
+
+		if (aic->ttime_samples == 0) {
+			/* process anticipated on has exitted or timed out*/
+			ad->exit_prob = (7*ad->exit_prob + 256)/8;
+		}
 	}
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
@@ -623,7 +640,7 @@ static int as_close_req(struct as_data *
 	unsigned long delay;	/* milliseconds */
 	sector_t last = ad->last_sector[ad->batch_data_dir];
 	sector_t next = arq->request->sector;
-	sector_t delta;  /* acceptable close offset (in sectors) */
+	sector_t delta;	/* acceptable close offset (in sectors) */
 
 	if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
 		delay = 0;
@@ -657,6 +674,15 @@ static int as_can_break_anticipation(str
 {
 	struct io_context *ioc;
 	struct as_io_context *aic;
+	sector_t s;
+
+	ioc = ad->io_context;
+	BUG_ON(!ioc);
+
+	if (arq && ioc == arq->io_context) {
+		/* request from same process */
+		return 1;
+	}
 
 	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
 		/* close request */
@@ -671,20 +697,14 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	ioc = ad->io_context;
-	BUG_ON(!ioc);
-
-	if (arq && ioc == arq->io_context) {
-		/* request from same process */
-		return 1;
-	}
-
 	aic = ioc->aic;
 	if (!aic)
 		return 0;
 
 	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
 		/* process anticipated on has exitted */
+		if (aic->ttime_samples == 0)
+			ad->exit_prob = (7*ad->exit_prob + 256)/8;
 		return 1;
 	}
 
@@ -698,27 +718,36 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (aic->seek_samples == 0 || aic->ttime_samples == 0) {
-		/*
-		 * Process has just started IO. Don't anticipate.
-		 * TODO! Must fix this up.
-		 */
-		return 1;
-	}
-
-	if (aic->ttime_mean > ad->antic_expire) {
+	if (aic->ttime_samples == 0) {
+		if (ad->new_ttime_mean > ad->antic_expire)
+			return 1;
+		if (ad->exit_prob > 128)
+			return 1;
+	} else if (aic->ttime_mean > ad->antic_expire) {
 		/* the process thinks too much between requests */
 		return 1;
 	}
 
-	if (arq && aic->seek_samples) {
-		sector_t s;
-		if (ad->last_sector[REQ_SYNC] < arq->request->sector)
-			s = arq->request->sector - ad->last_sector[REQ_SYNC];
-		else
-			s = ad->last_sector[REQ_SYNC] - arq->request->sector;
+	if (!arq)
+		return 0;
+
+	if (ad->last_sector[REQ_SYNC] < arq->request->sector)
+		s = arq->request->sector - ad->last_sector[REQ_SYNC];
+	else
+		s = ad->last_sector[REQ_SYNC] - arq->request->sector;
+
+	if (aic->seek_samples == 0) {
+		/*
+		 * Process has just started IO. Use past statistics to
+		 * guage success possibility
+		 */
+		if (ad->new_seek_mean/2 > s) {
+			/* this request is better than what we're expecting */
+			return 1;
+		}
 
-		if (aic->seek_mean > (s>>1)) {
+	} else {
+		if (aic->seek_mean/2 > s) {
 			/* this request is better than what we're expecting */
 			return 1;
 		}
@@ -763,12 +792,51 @@ static int as_can_anticipate(struct as_d
 	return 1;
 }
 
+static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime)
+{
+	/* fixed point: 1.0 == 1<<8 */
+	if (aic->ttime_samples == 0) {
+		ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
+		ad->new_ttime_mean = ad->new_ttime_total / 256;
+
+		ad->exit_prob = (7*ad->exit_prob)/8;
+	}
+	aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
+	aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
+	aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
+}
+
+static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist)
+{
+	u64 total;
+
+	if (aic->seek_samples == 0) {
+		ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
+		ad->new_seek_mean = ad->new_seek_total / 256;
+	}
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc
+	 */
+	if (aic->seek_samples <= 60) /* second&third seek */
+		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);
+
+	aic->seek_samples = (7*aic->seek_samples + 256) / 8;
+	aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
+	total = aic->seek_total + (aic->seek_samples/2);
+	do_div(total, aic->seek_samples);
+	aic->seek_mean = (sector_t)total;
+}
+
 /*
  * as_update_iohist keeps a decaying histogram of IO thinktimes, and
  * updates @aic->ttime_mean based on that. It is called when a new
  * request is queued.
  */
-static void as_update_iohist(struct as_io_context *aic, struct request *rq)
+static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq)
 {
 	struct as_rq *arq = RQ_DATA(rq);
 	int data_dir = arq->is_sync;
@@ -779,60 +847,29 @@ static void as_update_iohist(struct as_i
 		return;
 
 	if (data_dir == REQ_SYNC) {
+		unsigned long in_flight = atomic_read(&aic->nr_queued)
+					+ atomic_read(&aic->nr_dispatched);
 		spin_lock(&aic->lock);
-
-		if (test_bit(AS_TASK_IORUNNING, &aic->state)
-				&& !atomic_read(&aic->nr_queued)
-				&& !atomic_read(&aic->nr_dispatched)) {
+		if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
+				test_bit(AS_TASK_IOSTARTED, &aic->state)) {
 			/* Calculate read -> read thinktime */
-			thinktime = jiffies - aic->last_end_request;
-			thinktime = min(thinktime, MAX_THINKTIME-1);
-			/* fixed point: 1.0 == 1<<8 */
-			aic->ttime_samples += 256;
-			aic->ttime_total += 256*thinktime;
-			if (aic->ttime_samples)
-				/* fixed point factor is cancelled here */
-				aic->ttime_mean = (aic->ttime_total + 128)
-							/ aic->ttime_samples;
-			aic->ttime_samples = (aic->ttime_samples>>1)
-						+ (aic->ttime_samples>>2);
-			aic->ttime_total = (aic->ttime_total>>1)
-						+ (aic->ttime_total>>2);
-		}
-
-		/* Calculate read -> read seek distance */
-		if (!aic->seek_samples)
-			seek_dist = 0;
-		else if (aic->last_request_pos < rq->sector)
-			seek_dist = rq->sector - aic->last_request_pos;
-		else
-			seek_dist = aic->last_request_pos - rq->sector;
-
+			if (test_bit(AS_TASK_IORUNNING, &aic->state)
+			    && in_flight == 0) {
+				thinktime = jiffies - aic->last_end_request;
+				thinktime = min(thinktime, MAX_THINKTIME-1);
+			} else
+				thinktime = 0;
+			as_update_thinktime(ad, aic, thinktime);
+
+			/* Calculate read -> read seek distance */
+			if (aic->last_request_pos < rq->sector)
+				seek_dist = rq->sector - aic->last_request_pos;
+			else
+				seek_dist = aic->last_request_pos - rq->sector;
+			as_update_seekdist(ad, aic, seek_dist);
+		}
 		aic->last_request_pos = rq->sector + rq->nr_sectors;
-
-		/*
-		 * Don't allow the seek distance to get too large from the
-		 * odd fragment, pagein, etc
-		 */
-		if (aic->seek_samples < 400) /* second&third seek */
-			seek_dist = min(seek_dist, (aic->seek_mean * 4)
-							+ 2*1024*1024);
-		else
-			seek_dist = min(seek_dist, (aic->seek_mean * 4)
-							+ 2*1024*64);
-
-		aic->seek_samples += 256;
-		aic->seek_total += (u64)256*seek_dist;
-		if (aic->seek_samples) {
-			u64 total = aic->seek_total + (aic->seek_samples>>1);
-			do_div(total, aic->seek_samples);
-			aic->seek_mean = (sector_t)total;
-		}
-		aic->seek_samples = (aic->seek_samples>>1)
-					+ (aic->seek_samples>>2);
-		aic->seek_total = (aic->seek_total>>1)
-					+ (aic->seek_total>>2);
-
+		set_bit(AS_TASK_IOSTARTED, &aic->state);
 		spin_unlock(&aic->lock);
 	}
 }
@@ -1376,8 +1413,8 @@ static void as_add_request(struct as_dat
 	arq->io_context = as_get_io_context();
 
 	if (arq->io_context) {
+		as_update_iohist(ad, arq->io_context->aic, arq->request);
 		atomic_inc(&arq->io_context->aic->nr_queued);
-		as_update_iohist(arq->io_context->aic, arq->request);
 	}
 
 	alias = as_add_arq_rb(ad, arq);
@@ -1885,6 +1922,17 @@ as_var_store(unsigned long *var, const c
 	return count;
 }
 
+static ssize_t as_est_show(struct as_data *ad, char *page)
+{
+	int pos = 0;
+
+	pos += sprintf(page+pos, "%lu %% exit probability\n", 100*ad->exit_prob/256);
+	pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
+	pos += sprintf(page+pos, "%llu sectors new seek distance\n", (unsigned long long)ad->new_seek_mean);
+
+	return pos;
+}
+
 #define SHOW_FUNCTION(__FUNC, __VAR)					\
 static ssize_t __FUNC(struct as_data *ad, char *page)			\
 {									\
@@ -1916,6 +1964,10 @@ STORE_FUNCTION(as_write_batchexpire_stor
 		&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
 #undef STORE_FUNCTION
 
+static struct as_fs_entry as_est_entry = {
+	.attr = {.name = "est_time", .mode = S_IRUGO },
+	.show = as_est_show,
+};
 static struct as_fs_entry as_readexpire_entry = {
 	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
 	.show = as_readexpire_show,
@@ -1943,6 +1995,7 @@ static struct as_fs_entry as_write_batch
 };
 
 static struct attribute *default_attrs[] = {
+	&as_est_entry.attr,
 	&as_readexpire_entry.attr,
 	&as_writeexpire_entry.attr,
 	&as_anticexpire_entry.attr,
_
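For anyone testing this: the patch also exports the per-queue estimates
through the new read-only est_time attribute, formatted by as_est_show()
above.  Assuming the anticipatory scheduler is active on hda (the path
and the figures below are illustrative, not measured output), reading it
would look something like:

	# cat /sys/block/hda/queue/iosched/est_time
	12 % exit probability
	8 ms new thinktime
	1424 sectors new seek distance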