Some workloads really, really want to have no readahead. An example is a database which is performing small synchronous I/Os against a file which has extremely poor layout. Any readahead at all is a loss here. But the current readahead code refuses to adapt that low. Fix it up so that we can indeed adaptively disable readahead altogether, and do not start it again until we have seen max_readahead()'s worth of consecutive reads. include/linux/mm.h | 3 +- mm/filemap.c | 4 +-- mm/readahead.c | 61 ++++++++++++++++++++++++++++++++++++++++------------- 3 files changed, 51 insertions(+), 17 deletions(-) diff -puN mm/readahead.c~readahead-shrink-to-zero mm/readahead.c --- 25/mm/readahead.c~readahead-shrink-to-zero 2003-02-28 00:03:34.000000000 -0800 +++ 25-akpm/mm/readahead.c 2003-02-28 03:34:09.000000000 -0800 @@ -136,6 +136,12 @@ static int read_pages(struct address_spa * ahead_size: Together, these form the "ahead window". * ra_pages: The externally controlled max readahead for this fd. * + * When readahead is in the "maximally shrunk" state (next_size == -1UL), + * readahead is disabled. In this state, prev_page and size are used, inside + * handle_ra_miss(), to detect the resumption of sequential I/O. Once there + * has been a decent run of sequential I/O (defined by get_max_readahead), + * readahead is reenabled. + * * The readahead code manages two windows - the "current" and the "ahead" * windows. The intent is that while the application is walking the pages * in the current window, I/O is underway on the ahead window. When the @@ -168,6 +174,8 @@ static int read_pages(struct address_spa * will continue to perform linear reads. Either at the new file position, or * at the old one after another seek. * + * After enough misses, readahead is fully disabled (next_size == -1UL). 
+ * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to half of the @@ -253,14 +261,19 @@ int do_page_cache_readahead(struct addre int ret = 0; while (nr_to_read) { + int err; + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; - ret = __do_page_cache_readahead(mapping, filp, + err = __do_page_cache_readahead(mapping, filp, offset, this_chunk); - if (ret < 0) + if (err < 0) { + ret = err; break; + } + ret += err; offset += this_chunk; nr_to_read -= this_chunk; } @@ -286,6 +299,7 @@ check_ra_success(struct file_ra_state *r ra->ahead_size = ra->next_size; } else { ra->next_size = -1UL; + ra->size = 0; } } } @@ -345,16 +359,19 @@ page_cache_readahead(struct address_spac ra->next_size += 2; } else { /* - * A miss - lseek, pread, etc. Shrink the readahead + * A miss - lseek, pagefault, pread, etc. Shrink the readahead * window by 25%. */ - ra->next_size -= ra->next_size / 4; + ra->next_size -= ra->next_size / 4 + 2; } - if (ra->next_size > max) + if ((long)ra->next_size > (long)max) ra->next_size = max; - if (ra->next_size < min) - ra->next_size = min; + if ((long)ra->next_size <= 0L) { + ra->next_size = -1UL; + ra->size = 0; + goto out; /* Readahead is off */ + } /* * Is this request outside the current window? @@ -374,6 +391,7 @@ page_cache_readahead(struct address_spac ra->prev_page = ra->start; ra->ahead_start = 0; ra->ahead_size = 0; + /* * Control now returns, probably to sleep until I/O * completes against the first ahead page. 
@@ -394,7 +412,6 @@ do_io: ra->size = ra->next_size; ra->ahead_start = 0; /* Invalidate these */ ra->ahead_size = 0; - actual = do_page_cache_readahead(mapping, filp, offset, ra->size); check_ra_success(ra, ra->size, actual, orig_next_size); @@ -455,21 +472,37 @@ page_cache_readaround(struct address_spa * not found. This will happen if it was evicted by the VM (readahead * thrashing) or if the readahead window is maximally shrunk. * - * If the window has been maximally shrunk (next_size == 0) then bump it up - * again to resume readahead. + * If the window has been maximally shrunk (next_size == -1UL) then look to see + * if we are getting misses against sequential file offsets. If so, and this + * persists then resume readahead. * * Otherwise we're thrashing, so shrink the readahead window by three pages. * This is because it is grown by two pages on a readahead hit. Theory being * that the readahead window size will stabilise around the maximum level at * which there is no thrashing. */ -void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra) +void handle_ra_miss(struct address_space *mapping, + struct file_ra_state *ra, pgoff_t offset) { - const unsigned long min = get_min_readahead(ra); - if (ra->next_size == -1UL) { - ra->next_size = min; + const unsigned long max = get_max_readahead(ra); + + if (offset != ra->prev_page + 1) { + ra->size = 0; /* Not sequential */ + } else { + ra->size++; /* A sequential read */ + if (ra->size >= max) { /* Resume readahead */ + ra->start = offset - max; + ra->next_size = max; + ra->size = max; + ra->ahead_start = 0; + ra->ahead_size = 0; + } + } + ra->prev_page = offset; } else { + const unsigned long min = get_min_readahead(ra); + ra->next_size -= 3; if (ra->next_size < min) ra->next_size = min; diff -puN include/linux/mm.h~readahead-shrink-to-zero include/linux/mm.h --- 25/include/linux/mm.h~readahead-shrink-to-zero 2003-02-28 00:03:34.000000000 -0800 +++ 25-akpm/include/linux/mm.h 2003-02-28 
00:30:57.000000000 -0800 @@ -558,6 +558,7 @@ int write_one_page(struct page *page, in /* readahead.c */ #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ + int do_page_cache_readahead(struct address_space *mapping, struct file *filp, unsigned long offset, unsigned long nr_to_read); void page_cache_readahead(struct address_space *mapping, @@ -569,7 +570,7 @@ void page_cache_readaround(struct addres struct file *filp, unsigned long offset); void handle_ra_miss(struct address_space *mapping, - struct file_ra_state *ra); + struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); /* Do stack extension */ diff -puN mm/filemap.c~readahead-shrink-to-zero mm/filemap.c --- 25/mm/filemap.c~readahead-shrink-to-zero 2003-02-28 00:03:34.000000000 -0800 +++ 25-akpm/mm/filemap.c 2003-02-28 00:30:56.000000000 -0800 @@ -562,7 +562,7 @@ void do_generic_mapping_read(struct addr find_page: page = find_get_page(mapping, index); if (unlikely(page == NULL)) { - handle_ra_miss(mapping, ra); + handle_ra_miss(mapping, ra, index); goto no_cached_page; } if (!PageUptodate(page)) @@ -978,7 +978,7 @@ retry_find: page = find_get_page(mapping, pgoff); if (!page) { if (did_readahead) { - handle_ra_miss(mapping,ra); + handle_ra_miss(mapping, ra, pgoff); did_readahead = 0; } goto no_cached_page; _