From: Nick Piggin

The big regression from deadline is tiobench random reads with TCQ disks;
however, it is present in -linus as well, and would have been since day 1
of AS, but nobody has complained too loudly.

http://developer.osdl.org/judith/tiobench/4CPU/rr.html

This problem is probably a distillation of what causes lower database
throughput, because I have only ever seen it with TCQ drives, and pgbench
and OraSim are actually getting higher throughput here with a non-TCQ
drive.  That is not to say that TCQ is useless; it obviously can provide a
very real and significant boost.

What I might do in the (near) future is get AS to detect TCQ and turn
itself off indefinitely unless/until a sysfs flag is set, and default that
flag to off.

This patch changes the AS tunables a bit to be more on par with deadline.
It lowers the threshold for random reading processes to be considered
unsuitable for anticipation, and it slightly rearranges and comments the
"cooperative seek distance" logic (a small user-space sketch of that seek
comparison follows the patch).

With this patch, AS is now very competitive with deadline on the single
IDE and SCSI (non-TCQ) disks here.  In fact, I don't have any regressions
anywhere.

Even when TCQ is on, although throughput can be lower, AS still has
benefits because of its much better read vs write latency and its general
tendency to keep the number of outstanding tags smaller.

---

 drivers/block/as-iosched.c |   49 +++++++++++++++++++++++----------------------
 1 files changed, 26 insertions(+), 23 deletions(-)

diff -puN drivers/block/as-iosched.c~as-tuning drivers/block/as-iosched.c
--- 25/drivers/block/as-iosched.c~as-tuning	2004-01-11 00:11:03.000000000 -0800
+++ 25-akpm/drivers/block/as-iosched.c	2004-01-11 00:11:03.000000000 -0800
@@ -31,19 +31,19 @@
 /*
  * max time before a read is submitted.
  */
-#define default_read_expire (HZ / 20)
+#define default_read_expire (HZ / 8)
 
 /*
  * ditto for writes, these limits are not hard, even
  * if the disk is capable of satisfying them.
  */
-#define default_write_expire (HZ / 5)
+#define default_write_expire (HZ / 4)
 
 /*
  * read_batch_expire describes how long we will allow a stream of reads to
  * persist before looking to see whether it is time to switch over to writes.
  */
-#define default_read_batch_expire (HZ / 5)
+#define default_read_batch_expire (HZ / 4)
 
 /*
  * write_batch_expire describes how long we want a stream of writes to run for.
@@ -51,7 +51,7 @@
  * See, the problem is: we can send a lot of writes to disk cache / TCQ in
  * a short amount of time...
  */
-#define default_write_batch_expire (HZ / 20)
+#define default_write_batch_expire (HZ / 16)
 
 /*
  * max time we may wait to anticipate a read (default around 6ms)
@@ -426,6 +426,8 @@ as_find_arq_rb(struct as_data *ad, secto
  * for a request.
  */
 
+#define BACK_PENALTY 2
+
 /*
  * as_choose_req selects the preferred one of two requests of the same data_dir
  * ignoring time - eg. timeouts, which is the job of as_dispatch_request
@@ -459,7 +461,7 @@ as_choose_req(struct as_data *ad, struct
 	if (s1 >= last)
 		d1 = s1 - last;
 	else if (s1+maxback >= last)
-		d1 = (last - s1)*2;
+		d1 = (last - s1)*BACK_PENALTY;
 	else {
 		r1_wrap = 1;
 		d1 = 0; /* shut up, gcc */
@@ -468,7 +470,7 @@ as_choose_req(struct as_data *ad, struct
 	if (s2 >= last)
 		d2 = s2 - last;
 	else if (s2+maxback >= last)
-		d2 = (last - s2)*2;
+		d2 = (last - s2)*BACK_PENALTY;
 	else {
 		r2_wrap = 1;
 		d2 = 0;
@@ -657,7 +659,6 @@ static int as_close_req(struct as_data *
 	return (last - (delta>>1) <= next) && (next <= last + delta);
 }
 
-static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, unsigned long ttime);
 /*
  * as_can_break_anticipation returns true if we have been anticipating this
  * request.
@@ -685,20 +686,6 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
-	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
-		/* close request */
-		struct as_io_context *aic = ioc->aic;
-		if (aic) {
-			unsigned long thinktime;
-			spin_lock(&aic->lock);
-			thinktime = jiffies - aic->last_end_request;
-			aic->last_end_request = jiffies;
-			as_update_thinktime(ad, aic, thinktime);
-			spin_unlock(&aic->lock);
-		}
-		return 1;
-	}
-
 	if (ad->ioc_finished && as_antic_expired(ad)) {
 		/*
 		 * In this situation status should really be FINISHED,
@@ -728,6 +715,22 @@ static int as_can_break_anticipation(str
 		return 1;
 	}
 
+	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) {
+		/*
+		 * Found a close request that is not one of ours.
+		 *
+		 * This makes close requests from another process reset
+		 * our thinktime delay. Is generally useful when there are
+		 * two or more cooperating processes working in the same
+		 * area.
+		 */
+		spin_lock(&aic->lock);
+		aic->last_end_request = jiffies;
+		spin_unlock(&aic->lock);
+		return 1;
+	}
+
+
 	if (aic->ttime_samples == 0) {
 		if (ad->new_ttime_mean > ad->antic_expire)
 			return 1;
@@ -751,13 +754,13 @@ static int as_can_break_anticipation(str
 		 * Process has just started IO. Use past statistics to
 		 * guage success possibility
 		 */
-		if (ad->new_seek_mean/2 > s) {
+		if (ad->new_seek_mean > s) {
 			/* this request is better than what we're expecting */
 			return 1;
 		}
 	} else {
-		if (aic->seek_mean/2 > s) {
+		if (aic->seek_mean > s) {
 			/* this request is better than what we're expecting */
 			return 1;
 		}
_
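
For anyone who wants to poke at the seek comparison outside the kernel, here
is a minimal stand-alone sketch of the back-seek weighting that the
BACK_PENALTY hunks above feed into.  The MAXBACK value, the seek_dist()
helper and the sample sector numbers are made up for illustration; only the
distance rules mirror what as_choose_req() does.

	/*
	 * Stand-alone model (not kernel code) of the distance rules that
	 * BACK_PENALTY feeds into in as_choose_req():
	 *   - forward seeks count at their raw distance
	 *   - short backward seeks (within maxback) count at distance * BACK_PENALTY
	 *   - anything further back is treated as wrapped and loses to non-wrapped
	 */
	#include <stdio.h>

	#define BACK_PENALTY	2
	#define MAXBACK		(1024 * 1024)	/* stand-in back-seek window (sectors) */

	typedef unsigned long long sector_t;

	static sector_t seek_dist(sector_t last, sector_t s, int *wrap)
	{
		*wrap = 0;
		if (s >= last)
			return s - last;			/* forward seek */
		if (s + MAXBACK >= last)
			return (last - s) * BACK_PENALTY;	/* penalized backward seek */
		*wrap = 1;					/* too far back */
		return 0;
	}

	int main(void)
	{
		sector_t last = 1000000;		/* last serviced sector */
		sector_t s1 = 1000512, s2 = 999744;	/* two candidate requests */
		int w1, w2;
		sector_t d1 = seek_dist(last, s1, &w1);
		sector_t d2 = seek_dist(last, s2, &w2);

		if (w1 != w2)
			printf("prefer %s (the other wraps)\n", w1 ? "s2" : "s1");
		else
			printf("prefer %s (d1=%llu d2=%llu)\n",
			       d1 <= d2 ? "s1" : "s2", d1, d2);
		return 0;
	}

The effect of the weighting is simply that a request a short way behind the
head only beats a forward request when it is less than half as far away.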