fdatasync can fail to wait on some pages due to a race.

If some task (e.g. pdflush) is flushing the same mapping, it can remove a
page's dirty tag without then marking that page as being under writeback,
because pdflush hit a locked buffer in __block_write_full_page().  This
happens because kjournald is writing the buffer.  In this situation
__block_write_full_page() will redirty the page so that fsync notices it,
but there is a window in which the page eludes the radix-tree dirty-page
walk.  Consequently, a concurrent fsync will fail to notice the page when
walking the radix tree's dirty pages.

The approach taken by this patch is to leave the page marked as dirty in
the radix tree while ->writepage is working out what to do with it.  This
ensures that a concurrent write-for-sync will successfully locate the page
and will then block in lock_page() until the non-write-for-sync code has
finished altering the page state.
---

 25-akpm/fs/mpage.c          |    2 +-
 25-akpm/include/linux/mm.h  |    1 +
 25-akpm/mm/page-writeback.c |   35 ++++++++++++++++++++++++++++++++++-
 25-akpm/mm/vmscan.c         |    2 +-
 4 files changed, 37 insertions(+), 3 deletions(-)

diff -puN fs/mpage.c~clear_page_dirty_for_io fs/mpage.c
--- 25/fs/mpage.c~clear_page_dirty_for_io	2004-04-03 03:00:16.454730056 -0800
+++ 25-akpm/fs/mpage.c	2004-04-03 03:00:16.461728992 -0800
@@ -643,7 +643,7 @@ mpage_writepages(struct address_space *m
 			wait_on_page_writeback(page);
 
 			if (page->mapping == mapping && !PageWriteback(page) &&
-					test_clear_page_dirty(page)) {
+					clear_page_dirty_for_io(page)) {
 				if (writepage) {
 					ret = (*writepage)(page, wbc);
 					if (ret) {
diff -puN mm/page-writeback.c~clear_page_dirty_for_io mm/page-writeback.c
--- 25/mm/page-writeback.c~clear_page_dirty_for_io	2004-04-03 03:00:16.455729904 -0800
+++ 25-akpm/mm/page-writeback.c	2004-04-03 03:00:16.462728840 -0800
@@ -472,7 +472,7 @@ int write_one_page(struct page *page, in
 	if (wait)
 		wait_on_page_writeback(page);
 
-	if (test_clear_page_dirty(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		page_cache_get(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
@@ -574,6 +574,36 @@ int test_clear_page_dirty(struct page *p
 EXPORT_SYMBOL(test_clear_page_dirty);
 
 /*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		if (TestClearPageDirty(page)) {
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+/*
  * Clear a page's dirty flag while ignoring dirty memory accounting
  */
 int __clear_page_dirty(struct page *page)
@@ -629,6 +659,9 @@ int test_set_page_writeback(struct page
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_WRITEBACK);
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
diff -puN include/linux/mm.h~clear_page_dirty_for_io include/linux/mm.h
--- 25/include/linux/mm.h~clear_page_dirty_for_io	2004-04-03 03:00:16.456729752 -0800
+++ 25-akpm/include/linux/mm.h	2004-04-03 03:00:16.463728688 -0800
@@ -472,6 +472,7 @@ int get_user_pages(struct task_struct *t
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 int set_page_dirty_lock(struct page *page);
+int clear_page_dirty_for_io(struct page *page);
 
 /*
  * Prototype to add a shrinker callback for ageable caches.
diff -puN mm/vmscan.c~clear_page_dirty_for_io mm/vmscan.c
--- 25/mm/vmscan.c~clear_page_dirty_for_io	2004-04-03 03:00:16.458729448 -0800
+++ 25-akpm/mm/vmscan.c	2004-04-03 03:00:16.464728536 -0800
@@ -354,7 +354,7 @@ shrink_list(struct list_head *page_list,
 			goto keep_locked;
 		if (!may_write_to_queue(mapping->backing_dev_info))
 			goto keep_locked;
-		if (test_clear_page_dirty(page)) {
+		if (clear_page_dirty_for_io(page)) {
 			int res;
 			struct writeback_control wbc = {
 				.sync_mode = WB_SYNC_NONE,
_
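For illustration only, and not part of the patch: below is a minimal sketch
of the write-for-sync walk that this change protects, modelled on the
mpage_writepages() loop patched above.  The helper name sync_dirty_pages()
is an assumption for the example, not an existing kernel function.  The
point is that the PAGECACHE_TAG_DIRTY lookup still finds a page whose
PG_dirty flag was just cleared by a racing non-sync writer, and lock_page()
then sleeps until that thread's ->writepage() has run set_page_writeback()
or set_page_dirty(), bringing the tag back into sync.

/*
 * Illustrative sketch only: write out a mapping's dirty pages by walking
 * the radix tree's dirty tag.  Because clear_page_dirty_for_io() leaves
 * PAGECACHE_TAG_DIRTY set, pagevec_lookup_tag() still returns a page that
 * a racing thread has claimed for writeout, and lock_page() serialises
 * against that thread's ->writepage().
 */
static int sync_dirty_pages(struct address_space *mapping,
				struct writeback_control *wbc)
{
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned i, nr;
	int ret = 0;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup_tag(&pvec, mapping, &index,
				PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) != 0) {
		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);	/* may block on a racing writer */
			if (page->mapping == mapping && !PageWriteback(page) &&
					clear_page_dirty_for_io(page)) {
				/* ->writepage() unlocks the page */
				ret = mapping->a_ops->writepage(page, wbc);
			} else {
				/* raced: already written back or truncated */
				unlock_page(page);
			}
		}
		pagevec_release(&pvec);
	}
	return ret;
}

Without this patch, the same walk could miss a page whose dirty tag had
already been cleared by pdflush even though the page never reached
writeback state, which is exactly the fsync-versus-pdflush window the
changelog describes.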