If pdflush hits a locked-and-clean buffer in __block_write_full_page(), it will just pass over the buffer. Typically the buffer is an ext3 data=ordered buffer which is being written by kjournald, but a similar thing can happen with blockdev buffers and ll_rw_block(). This is bad because the buffer is still under I/O and a subsequent fsync's fdatawait() needs to know about it. It is not practical to tag the page for writeback - only the submitter of the I/O can do that, because the submitter has control of the end_io handler. So instead, redirty the page so a subsequent fsync's fdatawrite() will wait on the I/O that is already under way. There is a risk that pdflush::background_writeout() will lock up, repeatedly trying and failing to write the same page. This is prevented by ensuring that background_writeout() always throttles when it makes no progress. --- 25-akpm/fs/buffer.c | 19 ++++++++++++------- 25-akpm/fs/fs-writeback.c | 9 +++++++++ 25-akpm/include/linux/writeback.h | 1 + 25-akpm/mm/page-writeback.c | 8 ++++---- 4 files changed, 26 insertions(+), 11 deletions(-) diff -puN fs/buffer.c~block_write_full_page-redirty fs/buffer.c --- 25/fs/buffer.c~block_write_full_page-redirty 2004-04-03 03:00:16.719689776 -0800 +++ 25-akpm/fs/buffer.c 2004-04-03 03:00:16.728688408 -0800 @@ -1802,14 +1802,18 @@ static int __block_write_full_page(struc get_bh(bh); if (!buffer_mapped(bh)) continue; - if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. 
+ */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { lock_buffer(bh); - } else { - if (test_set_buffer_locked(bh)) { - if (buffer_dirty(bh)) - __set_page_dirty_nobuffers(page); - continue; - } + } else if (test_set_buffer_locked(bh)) { + __set_page_dirty_nobuffers(page); + continue; } if (test_clear_buffer_dirty(bh)) { if (!buffer_uptodate(bh)) @@ -1857,6 +1861,7 @@ done: if (uptodate) SetPageUptodate(page); end_page_writeback(page); + wbc->pages_skipped++; /* We didn't write this page */ } return err; diff -puN fs/fs-writeback.c~block_write_full_page-redirty fs/fs-writeback.c --- 25/fs/fs-writeback.c~block_write_full_page-redirty 2004-04-03 03:00:16.721689472 -0800 +++ 25-akpm/fs/fs-writeback.c 2004-04-03 03:00:16.729688256 -0800 @@ -279,6 +279,7 @@ sync_sb_inodes(struct super_block *sb, s struct inode, i_list); struct address_space *mapping = inode->i_mapping; struct backing_dev_info *bdi = mapping->backing_dev_info; + long pages_skipped; if (bdi->memory_backed) { if (sb == blockdev_superblock) { @@ -326,6 +327,7 @@ sync_sb_inodes(struct super_block *sb, s BUG_ON(inode->i_state & I_FREEING); __iget(inode); + pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); if (wbc->sync_mode == WB_SYNC_HOLD) { inode->dirtied_when = jiffies; @@ -333,6 +335,13 @@ sync_sb_inodes(struct super_block *sb, s } if (current_is_pdflush()) writeback_release(bdi); + if (wbc->pages_skipped != pages_skipped) { + /* + * writeback is not making progress due to locked + * buffers. Skip this inode for now. 
+ */ + list_move(&inode->i_list, &sb->s_dirty); + } spin_unlock(&inode_lock); iput(inode); spin_lock(&inode_lock); diff -puN include/linux/writeback.h~block_write_full_page-redirty include/linux/writeback.h --- 25/include/linux/writeback.h~block_write_full_page-redirty 2004-04-03 03:00:16.722689320 -0800 +++ 25-akpm/include/linux/writeback.h 2004-04-03 03:00:16.729688256 -0800 @@ -39,6 +39,7 @@ struct writeback_control { older than this */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ + long pages_skipped; /* Pages which were not written */ int nonblocking; /* Don't get stuck on request queues */ int encountered_congestion; /* An output: a queue is full */ int for_kupdate; /* A kupdate writeback */ diff -puN mm/page-writeback.c~block_write_full_page-redirty mm/page-writeback.c --- 25/mm/page-writeback.c~block_write_full_page-redirty 2004-04-03 03:00:16.723689168 -0800 +++ 25-akpm/mm/page-writeback.c 2004-04-03 03:00:16.730688104 -0800 @@ -261,13 +261,13 @@ static void background_writeout(unsigned break; wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; writeback_inodes(&wbc); min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0) { + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ - if (wbc.encountered_congestion) - blk_congestion_wait(WRITE, HZ/10); - else + blk_congestion_wait(WRITE, HZ/10); + if (!wbc.encountered_congestion) break; } } _