From: Daniel McNeil Should fix the final data-exposure problems with combined O_DIRECT and buffered I/O against local filesystems. The problem is that there are windows in which the normal kupdate writeback function has moved pages onto a different page list (locked_pages or io_pages). This writeback will discover that the page already has buffers which are under I/O (due to JBD commit activity) and __block_write_full_page() will then put the page back onto the dirty_pages list. But this leaves a window in which the page is not visible to filemap_datawrite()/filemap_fdatawait(). Consequently the sync which O_DIRECT performs to flush out any pending buffered writes will fail to wait on some blocks which already have I/O in flight. With enough luck, an O_DIRECT read can get in there and peek at unwritten data. We fix this by introducing serialisation between regular memory-cleansing writeback (WB_SYNC_NONE) and data-integrity writeback (WB_SYNC_ALL). A per-address_space rwsem is added. memory-cleansing writeback uses a read_trylock() so it doesn't get blocked by sync() operations. And the sync() operations use down_write() so they have exclusive access to the address_space's writeback, thus eliminating all these races. --- 25-akpm/fs/buffer.c | 3 +-- 25-akpm/fs/inode.c | 1 + 25-akpm/include/linux/fs.h | 1 + 25-akpm/mm/page-writeback.c | 27 +++++++++++++++++++++++++-- 25-akpm/mm/swap_state.c | 1 + 5 files changed, 29 insertions(+), 4 deletions(-) diff -puN fs/buffer.c~O_DIRECT-vs-buffered-fix fs/buffer.c --- 25/fs/buffer.c~O_DIRECT-vs-buffered-fix Mon Mar 1 14:52:24 2004 +++ 25-akpm/fs/buffer.c Mon Mar 1 14:52:24 2004 @@ -1816,8 +1816,7 @@ static int __block_write_full_page(struc lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - if (buffer_dirty(bh)) - __set_page_dirty_nobuffers(page); + __set_page_dirty_nobuffers(page); continue; } } diff -puN fs/inode.c~O_DIRECT-vs-buffered-fix fs/inode.c --- 25/fs/inode.c~O_DIRECT-vs-buffered-fix Mon Mar 1 14:52:24 2004 +++ 25-akpm/fs/inode.c Mon Mar 1 14:52:24 2004 @@ -195,6 +195,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); spin_lock_init(&inode->i_lock); i_size_ordered_init(inode); + init_rwsem(&inode->i_data.wb_rwsema); } EXPORT_SYMBOL(inode_init_once); diff -puN include/linux/fs.h~O_DIRECT-vs-buffered-fix include/linux/fs.h --- 25/include/linux/fs.h~O_DIRECT-vs-buffered-fix Mon Mar 1 14:52:24 2004 +++ 25-akpm/include/linux/fs.h Mon Mar 1 14:52:24 2004 @@ -339,6 +339,7 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ + struct rw_semaphore wb_rwsema; /* serialize SYNC writebacks */ }; struct block_device { diff -puN mm/page-writeback.c~O_DIRECT-vs-buffered-fix mm/page-writeback.c --- 25/mm/page-writeback.c~O_DIRECT-vs-buffered-fix Mon Mar 1 14:52:24 2004 +++ 25-akpm/mm/page-writeback.c Mon Mar 1 14:52:24 2004 @@ -482,9 +482,32 @@ void __init page_writeback_init(void) int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { + int ret; + if (wbc->sync_mode == WB_SYNC_NONE) { + if (!down_read_trylock(&mapping->wb_rwsema)) + /* + * SYNC writeback in progress + */ + return 0; + } else { + /* + * Only allow 1 SYNC writeback at a time, to be able + * to wait for all i/o without worrying about racing + * WB_SYNC_NONE writers. + */ + down_write(&mapping->wb_rwsema); + } + if (mapping->a_ops->writepages) - return mapping->a_ops->writepages(mapping, wbc); - return generic_writepages(mapping, wbc); + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + if (wbc->sync_mode == WB_SYNC_NONE) { + up_read(&mapping->wb_rwsema); + } else { + up_write(&mapping->wb_rwsema); + } + return ret; } /** diff -puN mm/swap_state.c~O_DIRECT-vs-buffered-fix mm/swap_state.c --- 25/mm/swap_state.c~O_DIRECT-vs-buffered-fix Mon Mar 1 14:52:24 2004 +++ 25-akpm/mm/swap_state.c Mon Mar 1 14:52:24 2004 @@ -38,6 +38,7 @@ struct address_space swapper_space = { .truncate_count = ATOMIC_INIT(0), .private_lock = SPIN_LOCK_UNLOCKED, .private_list = LIST_HEAD_INIT(swapper_space.private_list), + .wb_rwsema = __RWSEM_INITIALIZER(swapper_space.wb_rwsema) }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) _