From: Nick Piggin Introduce the notion of cooperating processes (those that submit requests close to one another), and use these statistics to make better choices about whether or not to do anticipatory waiting. Help and analysis from Seetharami Seelam Performance testing from Seelam: I set up my system and executed a couple of tests that I used for OLS. I tested with AS, cooperative process patch merged in -mm tree (which I called Nick, below) and the cooperative patch with modifications to as_update_iohist (which I called Seelam). I used a dual-processor (2.28GHz Pentium 4 Xeon) system, with 1 GB main memory and 1 MB L2 cache, running Linux 2.6.9. Only a single processor is used for the experiments. I used 7.2K RPM Maxtor 10GB drive configured with ext2 file system. Experiment 1 (ex1) consists of reading one Linux source trees using find . -type f -exec cat '{}' ';' > /dev/null. Experiment 2 (ex2) consists of reading two disjoint Linux source trees using find . -type f -exec cat '{}' ';' > /dev/null. Experiment 3 (ex3) consists of streaming read of a 2GB file in the background and 1 instance of the chunk reads in Experiment 1. Timings for reading the Linux source are shown below: AS Nick Seelam ex1: 0m25.813s 0m27.859s 0m27.640s ex2: 1m11.468s 1m13.918s 1m5.869s ex3: 81m44.352s 10m38.572s 6m47.994s The difference between the numbers in Experiment 3 must be due to the code in as_update_iohist. (akpm: that's not part of this patch. So this patch is "Nick"). Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton --- drivers/block/as-iosched.c | 269 +++++++++++++++++++++++---------------------- 1 files changed, 141 insertions(+), 128 deletions(-) diff -puN drivers/block/as-iosched.c~as-cooperating-processes drivers/block/as-iosched.c --- devel/drivers/block/as-iosched.c~as-cooperating-processes 2005-09-10 23:59:24.000000000 -0700 +++ devel-akpm/drivers/block/as-iosched.c 2005-09-10 23:59:24.000000000 -0700 @@ -4,7 +4,7 @@ * Anticipatory & deadline i/o scheduler. * * Copyright (C) 2002 Jens Axboe - * Nick Piggin + * Nick Piggin * */ #include @@ -103,6 +103,9 @@ struct as_data { unsigned long exit_prob; /* probability a task will exit while being waited on */ + unsigned long exit_no_coop; /* probablility an exitted task will + not be part of a later cooperating + request */ unsigned long new_ttime_total; /* mean thinktime on new proc */ unsigned long new_ttime_mean; u64 new_seek_total; /* mean seek on new proc */ @@ -629,34 +632,145 @@ static void as_antic_timeout(unsigned lo /* process anticipated on has exitted or timed out*/ ad->exit_prob = (7*ad->exit_prob + 256)/8; } + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { + /* process not "saved" by a cooperating request */ + ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8; + } } spin_unlock_irqrestore(q->queue_lock, flags); } +static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime) +{ + /* fixed point: 1.0 == 1<<8 */ + if (aic->ttime_samples == 0) { + ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8; + ad->new_ttime_mean = ad->new_ttime_total / 256; + + ad->exit_prob = (7*ad->exit_prob)/8; + } + aic->ttime_samples = (7*aic->ttime_samples + 256) / 8; + aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8; + aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples; +} + +static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist) +{ + u64 total; + + if (aic->seek_samples == 0) { + ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8; + ad->new_seek_mean = ad->new_seek_total / 256; + } + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc + */ + if (aic->seek_samples <= 60) /* second&third seek */ + sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64); + + aic->seek_samples = (7*aic->seek_samples + 256) / 8; + aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8; + total = aic->seek_total + (aic->seek_samples/2); + do_div(total, aic->seek_samples); + aic->seek_mean = (sector_t)total; +} + +/* + * as_update_iohist keeps a decaying histogram of IO thinktimes, and + * updates @aic->ttime_mean based on that. It is called when a new + * request is queued. + */ +static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + int data_dir = arq->is_sync; + unsigned long thinktime = 0; + sector_t seek_dist; + + if (aic == NULL) + return; + + if (data_dir == REQ_SYNC) { + unsigned long in_flight = atomic_read(&aic->nr_queued) + + atomic_read(&aic->nr_dispatched); + spin_lock(&aic->lock); + if (test_bit(AS_TASK_IORUNNING, &aic->state) || + test_bit(AS_TASK_IOSTARTED, &aic->state)) { + /* Calculate read -> read thinktime */ + if (test_bit(AS_TASK_IORUNNING, &aic->state) + && in_flight == 0) { + thinktime = jiffies - aic->last_end_request; + thinktime = min(thinktime, MAX_THINKTIME-1); + } + as_update_thinktime(ad, aic, thinktime); + + /* Calculate read -> read seek distance */ + if (aic->last_request_pos < rq->sector) + seek_dist = rq->sector - aic->last_request_pos; + else + seek_dist = aic->last_request_pos - rq->sector; + as_update_seekdist(ad, aic, seek_dist); + } + aic->last_request_pos = rq->sector + rq->nr_sectors; + set_bit(AS_TASK_IOSTARTED, &aic->state); + spin_unlock(&aic->lock); + } +} + /* * as_close_req decides if one request is considered "close" to the * previous one issued. */ -static int as_close_req(struct as_data *ad, struct as_rq *arq) +static int as_close_req(struct as_data *ad, struct as_io_context *aic, struct as_rq *arq) { unsigned long delay; /* milliseconds */ sector_t last = ad->last_sector[ad->batch_data_dir]; sector_t next = arq->request->sector; sector_t delta; /* acceptable close offset (in sectors) */ + sector_t s; if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) delay = 0; else delay = ((jiffies - ad->antic_start) * 1000) / HZ; - if (delay <= 1) - delta = 64; + if (delay == 0) + delta = 8192; else if (delay <= 20 && delay <= ad->antic_expire) - delta = 64 << (delay-1); + delta = 8192 << delay; else return 1; - return (last - (delta>>1) <= next) && (next <= last + delta); + if ((last <= next + (delta>>1)) && (next <= last + delta)) + return 1; + + if (last < next) + s = next - last; + else + s = last - next; + + if (aic->seek_samples == 0) { + /* + * Process has just started IO. Use past statistics to + * guage success possibility + */ + if (ad->new_seek_mean > s) { + /* this request is better than what we're expecting */ + return 1; + } + + } else { + if (aic->seek_mean > s) { + /* this request is better than what we're expecting */ + return 1; + } + } + + return 0; } /* @@ -676,7 +790,6 @@ static int as_can_break_anticipation(str { struct io_context *ioc; struct as_io_context *aic; - sector_t s; ioc = ad->io_context; BUG_ON(!ioc); @@ -698,13 +811,6 @@ static int as_can_break_anticipation(str if (!aic) return 0; - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - /* process anticipated on has exitted */ - if (aic->ttime_samples == 0) - ad->exit_prob = (7*ad->exit_prob + 256)/8; - return 1; - } - if (atomic_read(&aic->nr_queued) > 0) { /* process has more requests queued */ return 1; @@ -715,57 +821,45 @@ static int as_can_break_anticipation(str return 1; } - if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) { + if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) { /* * Found a close request that is not one of ours. * - * This makes close requests from another process reset - * our thinktime delay. Is generally useful when there are + * This makes close requests from another process update + * our IO history. Is generally useful when there are * two or more cooperating processes working in the same * area. */ - spin_lock(&aic->lock); - aic->last_end_request = jiffies; - spin_unlock(&aic->lock); + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { + if (aic->ttime_samples == 0) + ad->exit_prob = (7*ad->exit_prob + 256)/8; + + ad->exit_no_coop = (7*ad->exit_no_coop)/8; + } + + as_update_iohist(ad, aic, arq->request); return 1; } + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { + /* process anticipated on has exitted */ + if (aic->ttime_samples == 0) + ad->exit_prob = (7*ad->exit_prob + 256)/8; + + if (ad->exit_no_coop > 128) + return 1; + } if (aic->ttime_samples == 0) { if (ad->new_ttime_mean > ad->antic_expire) return 1; - if (ad->exit_prob > 128) + if (ad->exit_prob * ad->exit_no_coop > 128*256) return 1; } else if (aic->ttime_mean > ad->antic_expire) { /* the process thinks too much between requests */ return 1; } - if (!arq) - return 0; - - if (ad->last_sector[REQ_SYNC] < arq->request->sector) - s = arq->request->sector - ad->last_sector[REQ_SYNC]; - else - s = ad->last_sector[REQ_SYNC] - arq->request->sector; - - if (aic->seek_samples == 0) { - /* - * Process has just started IO. Use past statistics to - * guage success possibility - */ - if (ad->new_seek_mean > s) { - /* this request is better than what we're expecting */ - return 1; - } - - } else { - if (aic->seek_mean > s) { - /* this request is better than what we're expecting */ - return 1; - } - } - return 0; } @@ -805,88 +899,6 @@ static int as_can_anticipate(struct as_d return 1; } -static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime) -{ - /* fixed point: 1.0 == 1<<8 */ - if (aic->ttime_samples == 0) { - ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8; - ad->new_ttime_mean = ad->new_ttime_total / 256; - - ad->exit_prob = (7*ad->exit_prob)/8; - } - aic->ttime_samples = (7*aic->ttime_samples + 256) / 8; - aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8; - aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples; -} - -static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, sector_t sdist) -{ - u64 total; - - if (aic->seek_samples == 0) { - ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8; - ad->new_seek_mean = ad->new_seek_total / 256; - } - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (aic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64); - - aic->seek_samples = (7*aic->seek_samples + 256) / 8; - aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8; - total = aic->seek_total + (aic->seek_samples/2); - do_div(total, aic->seek_samples); - aic->seek_mean = (sector_t)total; -} - -/* - * as_update_iohist keeps a decaying histogram of IO thinktimes, and - * updates @aic->ttime_mean based on that. It is called when a new - * request is queued. - */ -static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, struct request *rq) -{ - struct as_rq *arq = RQ_DATA(rq); - int data_dir = arq->is_sync; - unsigned long thinktime; - sector_t seek_dist; - - if (aic == NULL) - return; - - if (data_dir == REQ_SYNC) { - unsigned long in_flight = atomic_read(&aic->nr_queued) - + atomic_read(&aic->nr_dispatched); - spin_lock(&aic->lock); - if (test_bit(AS_TASK_IORUNNING, &aic->state) || - test_bit(AS_TASK_IOSTARTED, &aic->state)) { - /* Calculate read -> read thinktime */ - if (test_bit(AS_TASK_IORUNNING, &aic->state) - && in_flight == 0) { - thinktime = jiffies - aic->last_end_request; - thinktime = min(thinktime, MAX_THINKTIME-1); - } else - thinktime = 0; - as_update_thinktime(ad, aic, thinktime); - - /* Calculate read -> read seek distance */ - if (aic->last_request_pos < rq->sector) - seek_dist = rq->sector - aic->last_request_pos; - else - seek_dist = aic->last_request_pos - rq->sector; - as_update_seekdist(ad, aic, seek_dist); - } - aic->last_request_pos = rq->sector + rq->nr_sectors; - set_bit(AS_TASK_IOSTARTED, &aic->state); - spin_unlock(&aic->lock); - } -} - /* * as_update_arq must be called whenever a request (arq) is added to * the sort_list. This function keeps caches up to date, and checks if the @@ -1952,6 +1964,7 @@ static ssize_t as_est_show(struct as_dat int pos = 0; pos += sprintf(page+pos, "%lu %% exit probability\n", 100*ad->exit_prob/256); + pos += sprintf(page+pos, "%lu %% probability of exiting without a cooperating process submitting IO\n", 100*ad->exit_no_coop/256); pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean); pos += sprintf(page+pos, "%llu sectors new seek distance\n", (unsigned long long)ad->new_seek_mean); _