From: Ram Pai

Currently the readahead code tends to read one more page than it should with
seeky database-style loads.  This was to prevent bogus readahead triggering
when we step into the last page of the current window.

The patch removes that workaround and fixes up the suboptimal logic instead.

With regard to the "rounding errors" mentioned in this patch, Ram provided
the following description:

  Say the i/o size is 20 pages.  Our algorithm starts with an initial average
  i/o size of 'ra_pages/2', which is typically 16.

  Now every time we take an average, the 'average' progresses as follows:

    (16+20)/2=18
    (18+20)/2=19
    (19+20)/2=19
    (19+20)/2=19.....

  and the rounding error means it never reaches 20.

Benchmarking sitrep:

IOZONE run on an NFS-mounted filesystem:

  client machine 2proc, 733MHz, 2GB memory
  server machine 8proc, 700MHz, 8GB memory

  ./iozone -c -t1 -s 4096m -r 128k

---------------------------------------------------------
|               | throughput | throughput | throughput  |
|               |   KB/sec   |   KB/sec   |   KB/sec    |
|               |    266     | 266+patch  |   2.4.20    |
---------------------------------------------------------
|sequential read| 11697.55   | 11700.98   | 10846.87    |
|               |            |            |             |
|re-read        | 11698.39   | 11691.84   | 10865.39    |
|               |            |            |             |
|reverse read   | 20002.71   | 20099.86   | 10340.34    |
|               |            |            |             |
|stride read    | 13813.01   | 13850.28   | 10193.87    |
|               |            |            |             |
|random read    | 19705.06   | 19978.00   | 10839.57    |
|               |            |            |             |
|random mix     | 28465.68   | 29964.38   | 10779.17    |
|               |            |            |             |
|pread          | 11692.95   | 11697.29   | 10863.56    |
---------------------------------------------------------

SYSBENCH run on machine 2proc, 733MHz, 256MB memory

---------------------------------------------------------
|               |    266     | 266+patch  |   2.4.21    |
---------------------------------------------------------
|time spent     | 79.6253    | 79.8176    | 73.2605sec  |
|               |            |            |             |
|Mb/sec         | 1.959Mb/sec| 1.954Mb/sec| 2.129Mb/sec |
|               |            |            |             |
|requests/sec   | 125.59     | 125.29     | 136.54      |
|               |            |            |             |
|no of Reads    | 6001       | 6001       | 6008        |
|               |            |            |             |
|no of Writes   | 3999       | 3999       | 3995        |
|               |            |            |             |
--------------------------------------------------------- --- 25-akpm/mm/readahead.c | 46 +++++++++++++++++++++++----------------------- 1 files changed, 23 insertions(+), 23 deletions(-) diff -puN mm/readahead.c~seeky-readahead-speedups mm/readahead.c --- 25/mm/readahead.c~seeky-readahead-speedups Mon May 17 13:59:13 2004 +++ 25-akpm/mm/readahead.c Mon May 17 13:59:13 2004 @@ -353,7 +353,7 @@ page_cache_readahead(struct address_spac unsigned orig_next_size; unsigned actual; int first_access=0; - unsigned long preoffset=0; + unsigned long average; /* * Here we detect the case where the application is performing @@ -394,10 +394,17 @@ page_cache_readahead(struct address_spac if (ra->serial_cnt <= (max * 2)) ra->serial_cnt++; } else { - ra->average = (ra->average + ra->serial_cnt) / 2; + /* + * to avoid rounding errors, ensure that 'average' + * tends towards the value of ra->serial_cnt. + */ + average = ra->average; + if (average < ra->serial_cnt) { + average++; + } + ra->average = (average + ra->serial_cnt) / 2; ra->serial_cnt = 1; } - preoffset = ra->prev_page; ra->prev_page = offset; if (offset >= ra->start && offset <= (ra->start + ra->size)) { @@ -457,18 +464,13 @@ do_io: * ahead window and get some I/O underway for the new * current window. */ - if (!first_access && preoffset >= ra->start && - preoffset < (ra->start + ra->size)) { - /* Heuristic: If 'n' pages were - * accessed in the current window, there - * is a high probability that around 'n' pages - * shall be used in the next current window. - * - * To minimize lazy-readahead triggered - * in the next current window, read in - * an extra page. + if (!first_access) { + /* Heuristic: there is a high probability + * that around ra->average number of + * pages shall be accessed in the next + * current window. 
*/ - ra->next_size = preoffset - ra->start + 2; + ra->next_size = min(ra->average , (unsigned long)max); } ra->start = offset; ra->size = ra->next_size; @@ -492,21 +494,19 @@ do_io: */ if (ra->ahead_start == 0) { /* - * if the average io-size is less than maximum + * If the average io-size is more than maximum * readahead size of the file the io pattern is * sequential. Hence bring in the readahead window * immediately. - * Else the i/o pattern is random. Bring - * in the readahead window only if the last page of - * the current window is accessed (lazy readahead). + * If the average io-size is less than maximum + * readahead size of the file the io pattern is + * random. Hence don't bother to readahead. */ - unsigned long average = ra->average; - + average = ra->average; if (ra->serial_cnt > average) - average = (ra->serial_cnt + ra->average) / 2; + average = (ra->serial_cnt + ra->average + 1) / 2; - if ((average >= max) || (offset == (ra->start + - ra->size - 1))) { + if (average > max) { ra->ahead_start = ra->start + ra->size; ra->ahead_size = ra->next_size; actual = do_page_cache_readahead(mapping, filp, _