diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/fs/aio.c 545-aio-O_SYNC/fs/aio.c --- 540-O_SYNC-speedup/fs/aio.c 2004-02-20 15:40:46.000000000 -0800 +++ 545-aio-O_SYNC/fs/aio.c 2004-02-20 15:41:01.000000000 -0800 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1308,28 +1309,40 @@ static ssize_t aio_pread(struct kiocb *i static ssize_t aio_pwrite(struct kiocb *iocb) { struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; ssize_t ret = 0; ret = file->f_op->aio_write(iocb, iocb->ki_buf, - iocb->ki_left, iocb->ki_pos); + iocb->ki_left, iocb->ki_pos); /* - * TBD: Even if iocb->ki_left = 0, could we need to - * wait for data to be sync'd ? Or can we assume - * that aio_fdsync/aio_fsync would be called explicitly - * as required. + * Even if iocb->ki_left = 0, we may need to wait + * for a balance_dirty_pages to complete */ if (ret > 0) { - iocb->ki_buf += ret; + iocb->ki_buf += iocb->ki_buf ? ret : 0; iocb->ki_left -= ret; ret = -EIOCBRETRY; } /* This means we must have transferred all that we could */ - /* No need to retry anymore */ - if (ret == 0) + /* No need to retry anymore unless we need to osync data */ + if (ret == 0) { ret = iocb->ki_nbytes - iocb->ki_left; + if (!iocb->ki_buf) + return ret; + + /* Set things up for potential O_SYNC */ + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + iocb->ki_buf = NULL; + iocb->ki_pos -= ret; /* back up fpos */ + iocb->ki_left = ret; /* sync what we have written out */ + iocb->ki_nbytes = ret; + ret = -EIOCBRETRY; + } + } return ret; } diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/include/linux/aio.h 545-aio-O_SYNC/include/linux/aio.h --- 540-O_SYNC-speedup/include/linux/aio.h 2004-02-20 15:40:36.000000000 -0800 +++ 545-aio-O_SYNC/include/linux/aio.h 2004-02-20 15:41:01.000000000 -0800 @@ -29,21 +29,26 @@ struct kioctx; #define KIF_LOCKED 0 #define KIF_KICKED 1 #define KIF_CANCELLED 2 +#define KIF_SYNCED 3 #define kiocbTryLock(iocb) test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbTryKick(iocb) test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags) +#define kiocbTrySync(iocb) test_and_set_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbSetLocked(iocb) set_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbSetKicked(iocb) set_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbSetSynced(iocb) set_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbClearLocked(iocb) clear_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbClearKicked(iocb) clear_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbClearSynced(iocb) clear_bit(KIF_SYNCED, &(iocb)->ki_flags) #define kiocbIsLocked(iocb) test_bit(KIF_LOCKED, &(iocb)->ki_flags) #define kiocbIsKicked(iocb) test_bit(KIF_KICKED, &(iocb)->ki_flags) #define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags) +#define kiocbIsSynced(iocb) test_bit(KIF_SYNCED, &(iocb)->ki_flags) struct kiocb { struct list_head ki_run_list; diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/include/linux/writeback.h 545-aio-O_SYNC/include/linux/writeback.h --- 540-O_SYNC-speedup/include/linux/writeback.h 2004-02-20 15:40:58.000000000 -0800 +++ 545-aio-O_SYNC/include/linux/writeback.h 2004-02-20 15:41:01.000000000 -0800 @@ -87,8 +87,10 @@ void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); -int sync_page_range(struct inode *inode, struct address_space *mapping, +ssize_t sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, size_t count); +ssize_t sync_page_range_nolock(struct inode *inode, struct address_space + *mapping, loff_t pos, size_t count); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/mm/filemap.c 545-aio-O_SYNC/mm/filemap.c --- 540-O_SYNC-speedup/mm/filemap.c 2004-02-20 15:40:58.000000000 -0800 +++ 545-aio-O_SYNC/mm/filemap.c 2004-02-20 15:41:01.000000000 -0800 @@ -944,22 +944,19 @@ __generic_file_aio_read(struct kiocb *io out: return retval; } - EXPORT_SYMBOL(__generic_file_aio_read); -ssize_t -generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +ssize_t generic_file_aio_read(struct kiocb *iocb, char __user *buf, + size_t count, loff_t pos) { struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - BUG_ON(iocb->ki_pos != pos); return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); } - EXPORT_SYMBOL(generic_file_aio_read); -ssize_t -generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +ssize_t generic_file_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) { struct iovec local_iov = { .iov_base = buf, .iov_len = count }; struct kiocb kiocb; @@ -971,10 +968,10 @@ generic_file_read(struct file *filp, cha ret = wait_on_sync_kiocb(&kiocb); return ret; } - EXPORT_SYMBOL(generic_file_read); -int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +int file_send_actor(read_descriptor_t * desc, struct page *page, + unsigned long offset, unsigned long size) { ssize_t written; unsigned long count = desc->count; @@ -1821,7 +1818,7 @@ EXPORT_SYMBOL(generic_write_checks); * okir@monad.swb.de */ ssize_t -generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, +__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; @@ -1990,7 +1987,7 @@ generic_file_aio_write_nolock(struct kio */ if (likely(status >= 0)) { if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) + if (!a_ops->writepage) status = generic_osync_inode(inode, mapping, OSYNC_METADATA|OSYNC_DATA); } @@ -2007,6 +2004,55 @@ out: EXPORT_SYMBOL(generic_file_aio_write_nolock); ssize_t +generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + loff_t pos = *ppos; + + if (!iov->iov_base && !is_sync_kiocb(iocb)) { + /* nothing to transfer, may just need to sync data */ + ret = iov->iov_len; /* vector AIO not supported yet */ + goto osync; + } + + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); + + /* + * Avoid doing a sync in parts for aio - its more efficient to + * call in again after all the data has been copied + */ + if (!is_sync_kiocb(iocb)) + return ret; + +osync: + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ret = sync_page_range_nolock(inode, mapping, pos, ret); + if (ret >= 0) + *ppos = pos + ret; + } + return ret; +} + + +ssize_t +__generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { @@ -2032,19 +2078,29 @@ ssize_t generic_file_aio_write(struct ki struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; - BUG_ON(iocb->ki_pos != pos); + if (!buf && !is_sync_kiocb(iocb)) { + /* nothing to transfer, may just need to sync data */ + ret = count; + goto osync; + } down(&inode->i_sem); - ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); up(&inode->i_sem); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; + /* + * Avoid doing a sync in parts for aio - its more efficient to + * call in again after all the data has been copied + */ + if (!is_sync_kiocb(iocb)) + return ret; - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) - ret = err; +osync: + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + ret = sync_page_range(inode, mapping, pos, ret); + if (ret >= 0) + iocb->ki_pos = pos + ret; } return ret; } @@ -2060,7 +2116,7 @@ ssize_t generic_file_write(struct file * .iov_len = count }; down(&inode->i_sem); - ret = generic_file_write_nolock(file, &local_iov, 1, ppos); + ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); up(&inode->i_sem); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { @@ -2097,11 +2153,11 @@ ssize_t generic_file_writev(struct file ssize_t ret; down(&inode->i_sem); - ret = generic_file_write_nolock(file, iov, nr_segs, ppos); + ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); up(&inode->i_sem); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; + ssize_t err; err = sync_page_range(inode, mapping, *ppos - ret, ret); if (err < 0) diff -purN -X /home/mbligh/.diff.exclude 540-O_SYNC-speedup/mm/page-writeback.c 545-aio-O_SYNC/mm/page-writeback.c --- 540-O_SYNC-speedup/mm/page-writeback.c 2004-02-20 15:40:58.000000000 -0800 +++ 545-aio-O_SYNC/mm/page-writeback.c 2004-02-20 15:41:01.000000000 -0800 @@ -571,16 +571,19 @@ int test_clear_page_dirty(struct page *p EXPORT_SYMBOL(test_clear_page_dirty); -static int operate_on_page_range(struct address_space *mapping, +static ssize_t operate_on_page_range(struct address_space *mapping, loff_t pos, size_t count, int (*operator)(struct page *)) { pgoff_t first = pos >> PAGE_CACHE_SHIFT; pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT; /* inclusive */ pgoff_t next = first; struct pagevec pvec; - ssize_t ret = 0; + ssize_t ret = 0, bytes = 0; int i; + if (count == 0) + return 0; + pagevec_init(&pvec, 0); while (pagevec_lookup(&pvec, mapping, next, min((pgoff_t)PAGEVEC_SIZE, last - next + 1))) { @@ -595,6 +598,8 @@ static int operate_on_page_range(struct } next = page->index + 1; ret = (*operator)(page); + if (ret == -EIOCBRETRY) + break; if (PageError(page)) { if (!ret) ret = -EIO; @@ -603,20 +608,22 @@ static int operate_on_page_range(struct break; } pagevec_release(&pvec); - if (next > last) + if ((next > last) || (ret == -EIOCBRETRY)) break; } - return ret; + bytes = (next << PAGE_CACHE_SHIFT) - pos; + if (bytes > count) + bytes = count; + return (bytes && (!ret || (ret == -EIOCBRETRY))) ? bytes : ret; } static int page_waiter(struct page *page) { unlock_page(page); - wait_on_page_writeback(page); - return 0; + return wait_on_page_writeback_wq(page, current->io_wait); } -static int +static size_t wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count) { return operate_on_page_range(mapping, pos, count, page_waiter); @@ -629,11 +636,15 @@ static int page_writer(struct page *page .nr_to_write = 1, }; + if (!test_clear_page_dirty(page)) { + unlock_page(page); + return 0; + } wait_on_page_writeback(page); return page->mapping->a_ops->writepage(page, &wbc); } -static int +static ssize_t write_out_page_range(struct address_space *mapping, loff_t pos, size_t count) { return operate_on_page_range(mapping, pos, count, page_writer); @@ -647,22 +658,58 @@ write_out_page_range(struct address_spac * We need to re-take i_sem during the generic_osync_inode list walk because * it is otherwise livelockable. */ -int sync_page_range(struct inode *inode, struct address_space *mapping, +ssize_t sync_page_range(struct inode *inode, struct address_space *mapping, loff_t pos, size_t count) { - int ret; + int ret = 0; + if (in_aio()) { + /* Already issued writeouts for this iocb ? */ + if (kiocbTrySync(io_wait_to_kiocb(current->io_wait))) + goto do_wait; /* just need to check if done */ + } if (!mapping->a_ops->writepage) return 0; if (mapping->backing_dev_info->memory_backed) return 0; ret = write_out_page_range(mapping, pos, count); - if (ret == 0) { + if (ret >= 0) { down(&inode->i_sem); ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); up(&inode->i_sem); } - if (ret == 0) +do_wait: + if (ret >= 0) + ret = wait_on_page_range(mapping, pos, count); + return ret; +} + +/* + * It is really better to use sync_page_range, rather than call + * sync_page_range_nolock while holding i_sem, if you don't + * want to block parallel O_SYNC writes until the pages in this + * range are written out. + */ +ssize_t sync_page_range_nolock(struct inode *inode, struct address_space + *mapping, loff_t pos, size_t count) +{ + ssize_t ret = 0; + + if (in_aio()) { + /* Already issued writeouts for this iocb ? */ + if (kiocbTrySync(io_wait_to_kiocb(current->io_wait))) + goto do_wait; /* just need to check if done */ + } + if (!mapping->a_ops->writepage) + return 0; + if (mapping->backing_dev_info->memory_backed) + return 0; + ret = write_out_page_range(mapping, pos, count); + if (ret >= 0) { + ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); + } +do_wait: + if (ret >= 0) ret = wait_on_page_range(mapping, pos, count); return ret; }