From: Suparna Bhattacharya

OK, have been playing with a patch to retry-enable the osync speedup
code.  This should get us O_SYNC support for aio (for ext2 at least).


 fs/aio.c                  |   42 ++++++++++++++++++++++++++++++++++++------
 include/linux/pagemap.h   |    8 ++++++++
 include/linux/writeback.h |    2 +-
 mm/filemap.c              |    4 ++--
 mm/page-writeback.c       |   23 ++++++++++++++---------
 5 files changed, 61 insertions(+), 18 deletions(-)

diff -puN fs/aio.c~aio-09-o_sync fs/aio.c
--- 25/fs/aio.c~aio-09-o_sync	2003-06-16 23:11:58.000000000 -0700
+++ 25-akpm/fs/aio.c	2003-06-16 23:11:58.000000000 -0700
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include

@@ -1246,16 +1247,21 @@ static ssize_t aio_pread(struct kiocb *i
 static ssize_t aio_pwrite(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
 	ssize_t ret = 0;

+	if (!iocb->ki_buf) {
+		ret = iocb->ki_left;
+		goto retry_osync;
+	}
+
 	ret = file->f_op->aio_write(iocb, iocb->ki_buf,
 				iocb->ki_left, iocb->ki_pos);

 	/*
-	 * TBD: Even if iocb->ki_left = 0, could we need to
-	 * wait for data to be sync'd ? Or can we assume
-	 * that aio_fdsync/aio_fsync would be called explicitly
-	 * as required.
+	 * Even if iocb->ki_left = 0, we may need to wait
+	 * for a balance_dirty_pages to complete
 	 */
 	if (ret > 0) {
 		iocb->ki_buf += ret;
@@ -1265,10 +1271,34 @@ static ssize_t aio_pwrite(struct kiocb *
 	}

 	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
-	if (ret == 0)
+	/* No need to retry anymore unless we need to osync data */
+	if (ret == 0) {
 		ret = iocb->ki_nbytes - iocb->ki_left;
+		/* Set things up for potential O_SYNC */
+		iocb->ki_buf = NULL;
+		iocb->ki_pos -= ret; /* back up fpos */
+		iocb->ki_left = ret; /* sync only what we have written out */
+		iocb->ki_nbytes = ret;
+	}
+
+retry_osync:
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
+
+		err = sync_page_range(inode, mapping, iocb->ki_pos, ret);
+		if (err < 0)
+			ret = err;
+		else {
+			printk("synced %d bytes\n", err);
+			iocb->ki_pos += err;
+			iocb->ki_left -= err;
+			if ((iocb->ki_left) && (err != 0))
+				ret = -EIOCBRETRY;
+			else
+				ret = iocb->ki_nbytes;
+		}
+	}
 	return ret;
 }

diff -puN include/linux/pagemap.h~aio-09-o_sync include/linux/pagemap.h
--- 25/include/linux/pagemap.h~aio-09-o_sync	2003-06-16 23:11:58.000000000 -0700
+++ 25-akpm/include/linux/pagemap.h	2003-06-16 23:11:58.000000000 -0700
@@ -183,6 +183,14 @@ static inline void wait_on_page_writebac
 		wait_on_page_bit(page, PG_writeback);
 }

+static inline int wait_on_page_writeback_wq(struct page *page,
+						wait_queue_t *wait)
+{
+	if (PageWriteback(page))
+		return wait_on_page_bit_wq(page, PG_writeback, wait);
+	return 0;
+}
+
 extern void end_page_writeback(struct page *page);

 /*
diff -puN include/linux/writeback.h~aio-09-o_sync include/linux/writeback.h
--- 25/include/linux/writeback.h~aio-09-o_sync	2003-06-16 23:11:58.000000000 -0700
+++ 25-akpm/include/linux/writeback.h	2003-06-16 23:11:58.000000000 -0700
@@ -88,7 +88,7 @@ int balance_dirty_pages(struct address_s
 int balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
-int sync_page_range(struct inode *inode, struct address_space *mapping,
+ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count);

 /* pdflush.c */
diff -puN mm/filemap.c~aio-09-o_sync mm/filemap.c
--- 25/mm/filemap.c~aio-09-o_sync	2003-06-16 23:11:58.000000000 -0700
+++ 25-akpm/mm/filemap.c	2003-06-16 23:11:58.000000000 -0700
@@ -1858,7 +1858,7 @@ generic_file_aio_write_nolock(struct kio
 	 */
 	if (likely(status >= 0)) {
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+			if (!a_ops->writepage)
 				status = generic_osync_inode(inode,
 						OSYNC_METADATA|OSYNC_DATA);
 		}
@@ -1962,7 +1962,7 @@ ssize_t generic_file_writev(struct file
 	up(&inode->i_sem);

 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err;
+		ssize_t err;

 		err = sync_page_range(inode, inode->i_mapping,
 					*ppos - ret, ret);
diff -puN mm/page-writeback.c~aio-09-o_sync mm/page-writeback.c
--- 25/mm/page-writeback.c~aio-09-o_sync	2003-06-16 23:11:58.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-06-16 23:11:58.000000000 -0700
@@ -569,7 +569,7 @@ int test_clear_page_dirty(struct page *p

 EXPORT_SYMBOL(test_clear_page_dirty);

-static int operate_on_page_range(struct address_space *mapping,
+static ssize_t operate_on_page_range(struct address_space *mapping,
 		loff_t pos, size_t count, int (*operator)(struct page *))
 {
 	pgoff_t first = pos >> PAGE_CACHE_SHIFT;
@@ -593,6 +593,10 @@ static int operate_on_page_range(struct
 			}
 			next = page->index + 1;
 			ret = (*operator)(page);
+			if (ret == -EIOCBRETRY) {
+				next--;
+				break;
+			}
 			if (PageError(page)) {
 				if (!ret)
 					ret = -EIO;
@@ -601,20 +605,21 @@ static int operate_on_page_range(struct
 				break;
 		}
 		pagevec_release(&pvec);
-		if (next > last)
+		if ((next > last) || (ret == -EIOCBRETRY))
 			break;
 	}
+	if (!ret || (ret == -EIOCBRETRY))
+		ret = (next << PAGE_CACHE_SHIFT) - pos;
 	return ret;
 }

 static int page_waiter(struct page *page)
 {
 	unlock_page(page);
-	wait_on_page_writeback(page);
-	return 0;
+	return wait_on_page_writeback_wq(page, current->io_wait);
 }

-static int
+static size_t
 wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count)
 {
 	return operate_on_page_range(mapping, pos, count, page_waiter);
@@ -631,7 +636,7 @@ static int page_writer(struct page *page
 	return page->mapping->a_ops->writepage(page, &wbc);
 }

-static int
+static ssize_t
 write_out_page_range(struct address_space *mapping, loff_t pos, size_t count)
 {
 	return operate_on_page_range(mapping, pos, count, page_writer);
@@ -645,7 +650,7 @@ write_out_page_range(struct address_spac
  * We need to re-take i_sem during the generic_osync_inode list walk because
  * it is otherwise livelockable.
  */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
+ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, size_t count)
 {
 	int ret;
@@ -655,12 +660,12 @@ int sync_page_range(struct inode *inode,
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 	ret = write_out_page_range(mapping, pos, count);
-	if (ret == 0) {
+	if (ret >= 0) {
 		down(&inode->i_sem);
 		ret = generic_osync_inode(inode, OSYNC_METADATA);
 		up(&inode->i_sem);
 	}
-	if (ret == 0)
+	if (ret >= 0)
 		ret = wait_on_page_range(mapping, pos, count);
 	return ret;
 }
_
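
To make the partial-progress accounting concrete (assuming 4K pages, so
PAGE_CACHE_SHIFT is 12): if a 20K range starting at pos 0 is being
synced and page_waiter() returns -EIOCBRETRY on the page at index 2,
operate_on_page_range() backs next up to 2 and returns
(2 << PAGE_CACHE_SHIFT) - pos = 8192, so aio_pwrite() advances ki_pos
and shrinks ki_left by 8192 bytes and returns -EIOCBRETRY; when the
retry machinery (from the rest of the aio retry series) reruns the
iocb, it comes back in with ki_buf == NULL, jumps to retry_osync and
continues the sync from byte 8192.

For reference, here is a minimal userspace sketch of the kind of
request this is meant to service: one buffered O_SYNC pwrite submitted
through libaio.  This is illustrative only and not part of the patch;
it assumes libaio is installed (link with -laio), and the filename
"testfile" and the 4k size are made up.  The intent is that the
completion event only arrives once sync_page_range() has written out
and waited on the submitted range, rather than the submitter blocking
in generic_osync_inode().

/*
 * Illustrative only: submit one O_SYNC buffered write via libaio and
 * wait for its completion event.
 */
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	static char buf[4096];
	int fd;

	memset(buf, 'x', sizeof(buf));

	fd = open("testfile", O_WRONLY | O_CREAT | O_SYNC, 0644);
	if (fd < 0 || io_setup(1, &ctx) != 0)
		return 1;

	/* queue a 4k write at offset 0 on the O_SYNC fd */
	io_prep_pwrite(&cb, fd, buf, sizeof(buf), 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	/* completion implies the O_SYNC writeout has finished as well */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	printf("res=%ld\n", (long)ev.res);

	io_destroy(ctx);
	close(fd);
	return 0;
}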