From: Suparna Bhattacharya OK, I have been playing with a patch to retry-enable the osync speedup code. This should get us O_SYNC support for aio (for ext2 at least). 25-akpm/fs/aio.c | 44 +++++++++++++++++++++++++++++++------- 25-akpm/include/linux/pagemap.h | 8 ++++++ 25-akpm/include/linux/writeback.h | 2 - 25-akpm/mm/filemap.c | 4 +-- 25-akpm/mm/page-writeback.c | 23 ++++++++++++------- 5 files changed, 62 insertions(+), 19 deletions(-) diff -puN fs/aio.c~aio-09-o_sync fs/aio.c --- 25/fs/aio.c~aio-09-o_sync Tue Jul 8 16:07:18 2003 +++ 25-akpm/fs/aio.c Tue Jul 8 16:07:26 2003 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1245,16 +1246,21 @@ static ssize_t aio_pread(struct kiocb *i static ssize_t aio_pwrite(struct kiocb *iocb) { struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; ssize_t ret = 0; + if (!iocb->ki_buf) { + ret = iocb->ki_left; + goto retry_osync; + } + ret = file->f_op->aio_write(iocb, iocb->ki_buf, - iocb->ki_left, iocb->ki_pos); + iocb->ki_left, iocb->ki_pos); /* - * TBD: Even if iocb->ki_left = 0, could we need to - * wait for data to be sync'd ? Or can we assume - * that aio_fdsync/aio_fsync would be called explicitly - * as required. 
+ * Even if iocb->ki_left = 0, we may need to wait + * for a balance_dirty_pages to complete */ if (ret > 0) { iocb->ki_buf += ret; @@ -1264,10 +1270,34 @@ static ssize_t aio_pwrite(struct kiocb * } /* This means we must have transferred all that we could */ - /* No need to retry anymore */ - if (ret == 0) + /* No need to retry anymore unless we need to osync data */ + if (ret == 0) { ret = iocb->ki_nbytes - iocb->ki_left; + /* Set things up for potential O_SYNC */ + iocb->ki_buf = NULL; + iocb->ki_pos -= ret; /* back up fpos */ + iocb->ki_left = ret; /* sync only what we have written out */ + iocb->ki_nbytes = ret; + } + +retry_osync: + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ssize_t err; + + err = sync_page_range(inode, mapping, iocb->ki_pos, ret); + if (err < 0) + ret = err; + else { + printk("synced %d bytes\n", err); + iocb->ki_pos += err; + iocb->ki_left -= err; + if ((iocb->ki_left) && (err != 0)) + ret = -EIOCBRETRY; + else + ret = iocb->ki_nbytes; + } + } return ret; } diff -puN include/linux/pagemap.h~aio-09-o_sync include/linux/pagemap.h --- 25/include/linux/pagemap.h~aio-09-o_sync Tue Jul 8 16:07:18 2003 +++ 25-akpm/include/linux/pagemap.h Tue Jul 8 16:07:18 2003 @@ -183,6 +183,14 @@ static inline void wait_on_page_writebac wait_on_page_bit(page, PG_writeback); } +static inline int wait_on_page_writeback_wq(struct page *page, + wait_queue_t *wait) +{ + if (PageWriteback(page)) + return wait_on_page_bit_wq(page, PG_writeback, wait); + return 0; +} + extern void end_page_writeback(struct page *page); /* diff -puN include/linux/writeback.h~aio-09-o_sync include/linux/writeback.h --- 25/include/linux/writeback.h~aio-09-o_sync Tue Jul 8 16:07:18 2003 +++ 25-akpm/include/linux/writeback.h Tue Jul 8 16:07:18 2003 @@ -88,7 +88,7 @@ int balance_dirty_pages(struct address_s int balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); int do_writepages(struct 
address_space *mapping, struct writeback_control *wbc); -int sync_page_range(struct inode *inode, struct address_space *mapping, +ssize_t sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, size_t count); /* pdflush.c */ diff -puN mm/filemap.c~aio-09-o_sync mm/filemap.c --- 25/mm/filemap.c~aio-09-o_sync Tue Jul 8 16:07:18 2003 +++ 25-akpm/mm/filemap.c Tue Jul 8 16:07:24 2003 @@ -1876,7 +1876,7 @@ generic_file_aio_write_nolock(struct kio */ if (likely(status >= 0)) { if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) + if (!a_ops->writepage) status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); } @@ -1980,7 +1980,7 @@ ssize_t generic_file_writev(struct file up(&inode->i_sem); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; + ssize_t err; err = sync_page_range(inode, inode->i_mapping, *ppos - ret, ret); diff -puN mm/page-writeback.c~aio-09-o_sync mm/page-writeback.c --- 25/mm/page-writeback.c~aio-09-o_sync Tue Jul 8 16:07:18 2003 +++ 25-akpm/mm/page-writeback.c Tue Jul 8 16:07:26 2003 @@ -570,7 +570,7 @@ int test_clear_page_dirty(struct page *p EXPORT_SYMBOL(test_clear_page_dirty); -static int operate_on_page_range(struct address_space *mapping, +static ssize_t operate_on_page_range(struct address_space *mapping, loff_t pos, size_t count, int (*operator)(struct page *)) { pgoff_t first = pos >> PAGE_CACHE_SHIFT; @@ -594,6 +594,10 @@ static int operate_on_page_range(struct } next = page->index + 1; ret = (*operator)(page); + if (ret == -EIOCBRETRY) { + next--; + break; + } if (PageError(page)) { if (!ret) ret = -EIO; @@ -602,20 +606,21 @@ static int operate_on_page_range(struct break; } pagevec_release(&pvec); - if (next > last) + if ((next > last) || (ret == -EIOCBRETRY)) break; } + if (!ret || (ret == -EIOCBRETRY)) + ret = (next << PAGE_CACHE_SHIFT) - pos; return ret; } static int page_waiter(struct page *page) { unlock_page(page); - 
wait_on_page_writeback(page); - return 0; + return wait_on_page_writeback_wq(page, current->io_wait); } -static int +static size_t wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count) { return operate_on_page_range(mapping, pos, count, page_waiter); @@ -632,7 +637,7 @@ static int page_writer(struct page *page return page->mapping->a_ops->writepage(page, &wbc); } -static int +static ssize_t write_out_page_range(struct address_space *mapping, loff_t pos, size_t count) { return operate_on_page_range(mapping, pos, count, page_writer); @@ -646,7 +651,7 @@ write_out_page_range(struct address_spac * We need to re-take i_sem during the generic_osync_inode list walk because * it is otherwise livelockable. */ -int sync_page_range(struct inode *inode, struct address_space *mapping, +ssize_t sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, size_t count) { int ret; @@ -656,12 +661,12 @@ int sync_page_range(struct inode *inode, if (mapping->backing_dev_info->memory_backed) return 0; ret = write_out_page_range(mapping, pos, count); - if (ret == 0) { + if (ret >= 0) { down(&inode->i_sem); ret = generic_osync_inode(inode, OSYNC_METADATA); up(&inode->i_sem); } - if (ret == 0) + if (ret >= 0) ret = wait_on_page_range(mapping, pos, count); return ret; } _