From: Suparna Bhattacharya The attached patch plugs some inaccuracies and associated inefficiencies in sync_page_range handling for AIO O_SYNC writes. - Correctly propagate the return value from sync_page_range when some but not all pages in the range have completed writeback (Otherwise, O_SYNC AIO writes could report completion before all the relevant writeouts actually completed) - Reset and check page dirty settings prior to attempting writeouts to avoid blocking synchronously for a writeback initiated otherwise. - Disable repeated calls to writeout or generic_osync_inode during AIO retries for the same iocb. - Don't block synchronously for data writeouts to complete when writing out inode state for generic_osync_inode in AIO context. The wait_on_page_range call after this will take care of completing the iocb only after the data has been written out. [TBD: Is this particular change OK for data=ordered, or does it need rethinking ?] I have run aio-stress with modifications for AIO O_SYNC writes on ext2, ext3, jfs, xfs and blockdev. fs/fs-writeback.c | 2 +- include/linux/aio.h | 5 +++++ mm/filemap.c | 20 ++++++-------------- mm/page-writeback.c | 20 ++++++++++++++++++-- 4 files changed, 30 insertions(+), 17 deletions(-) diff -puN include/linux/aio.h~aio-osync-fix-2 include/linux/aio.h --- 25/include/linux/aio.h~aio-osync-fix-2 2003-08-26 00:04:42.000000000 -0700 +++ 25-akpm/include/linux/aio.h 2003-08-26 00:04:42.000000000 -0700 @@ -29,21 +29,26 @@ struct kioctx; #define KIF_LOCKED 0 #define KIF_KICKED 1 #define KIF_CANCELLED 2 +#define KIF_SYNCED 3 #define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) +#define kiocbTrySync(iocb) test_and_set_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbSetSynced(iocb) set_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbClearSynced(iocb) clear_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbIsSynced(iocb) test_bit(KIF_SYNCED, &(iocb)->ki_flags) struct kiocb { struct list_head ki_run_list; diff -puN mm/filemap.c~aio-osync-fix-2 mm/filemap.c --- 25/mm/filemap.c~aio-osync-fix-2 2003-08-26 00:04:42.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-08-26 00:04:42.000000000 -0700 @@ -1953,13 +1953,9 @@ generic_file_aio_write_nolock(struct kio osync: if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - else - *ppos = pos + err; + ret = sync_page_range_nolock(inode, mapping, pos, ret); + if (ret >= 0) + *ppos = pos + ret; } return ret; } @@ -2023,13 +2019,9 @@ ssize_t generic_file_aio_write(struct ki osync: if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) - ret = err; - else - iocb->ki_pos = pos + err; + ret = sync_page_range(inode, mapping, pos, ret); + if (ret >= 0) + iocb->ki_pos = pos + ret; } return ret; } diff -puN mm/page-writeback.c~aio-osync-fix-2 mm/page-writeback.c --- 25/mm/page-writeback.c~aio-osync-fix-2 2003-08-26 00:04:42.000000000 -0700 +++ 25-akpm/mm/page-writeback.c 2003-08-26 00:04:42.000000000 -0700 @@ -641,6 +641,10 @@ static int page_writer(struct page *page .nr_to_write = 1, }; + if (!test_clear_page_dirty(page)) { + unlock_page(page); + return 0; + } wait_on_page_writeback(page); return page->mapping->a_ops->writepage(page, &wbc); } @@ -662,8 +666,13 @@ write_out_page_range(struct address_spac ssize_t sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, size_t count) { - int ret; + int ret = 0; + if (in_aio()) { + /* Already issued writeouts for this iocb ? */ + if (kiocbTrySync(io_wait_to_kiocb(current->io_wait))) + goto do_wait; /* just need to check if done */ + } if (!mapping->a_ops->writepage) return 0; if (mapping->backing_dev_info->memory_backed) @@ -674,6 +683,7 @@ ssize_t sync_page_range(struct inode *in ret = generic_osync_inode(inode, OSYNC_METADATA); up(&inode->i_sem); } +do_wait: if (ret >= 0) ret = wait_on_page_range(mapping, pos, count); return ret; @@ -688,8 +698,13 @@ ssize_t sync_page_range(struct inode *in ssize_t sync_page_range_nolock(struct inode *inode, struct address_space *mapping, loff_t pos, size_t count) { - int ret; + ssize_t ret = 0; + if (in_aio()) { + /* Already issued writeouts for this iocb ? */ + if (kiocbTrySync(io_wait_to_kiocb(current->io_wait))) + goto do_wait; /* just need to check if done */ + } if (!mapping->a_ops->writepage) return 0; if (mapping->backing_dev_info->memory_backed) @@ -698,6 +713,7 @@ ssize_t sync_page_range_nolock(struct in if (ret >= 0) { ret = generic_osync_inode(inode, OSYNC_METADATA); } +do_wait: if (ret >= 0) ret = wait_on_page_range(mapping, pos, count); return ret; _