From: Suparna Bhattacharya

The current implementation of O_SYNC support for AIO calls sync_page_range
explicitly in the high level aio write handlers after invoking the fops
aio_write method.  This is a duplication of logic across sync I/O and AIO
(not good from a maintenance perspective, and more error prone - e.g. it
forgets to check for O_DIRECT).  More importantly, it doesn't take into
account the possibility that different filesystems may have their own
O_SYNC implementations in their corresponding aio methods.  This is a
problem, for example, in the case of ext3, which may commit the journal
after data and meta-data are written out.

This patch fixes the above problems by moving the call to sync_page_range
out of the high level aio_pwrite handler, and back into
generic_file_aio_write for both AIO and sync I/O.  It takes care to make
sure that, in the AIO case, sync_page_range is called only after all the
data has been copied, rather than on every retry.  (A simplified,
illustrative sketch of the resulting retry flow appears after the patch.)

DESC
aio-O_SYNC-fix bits got lost
EDESC
From: Suparna Bhattacharya

Patch containing just the missing hunks

DESC
aio: writev nr_segs fix
EDESC
From: Suparna Bhattacharya

I'm not sure if this would help here, but there is one bug I just spotted
that would affect writev from XFS: I wasn't passing nr_segs down properly.


 fs/aio.c                  |   36 +++++-------------
 include/linux/writeback.h |    2 +
 mm/filemap.c              |   89 ++++++++++++++++++++++++++++++++++++++++------
 mm/page-writeback.c       |   24 ++++++++++++
 4 files changed, 114 insertions(+), 37 deletions(-)

diff -puN include/linux/writeback.h~aio-O_SYNC-fix include/linux/writeback.h
--- 25/include/linux/writeback.h~aio-O_SYNC-fix	2003-09-14 23:09:14.000000000 -0700
+++ 25-akpm/include/linux/writeback.h	2003-09-14 23:09:39.000000000 -0700
@@ -89,6 +89,8 @@ int pdflush_operation(void (*fn)(unsigne
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 ssize_t sync_page_range(struct inode *inode, struct address_space *mapping,
                         loff_t pos, size_t count);
+ssize_t sync_page_range_nolock(struct inode *inode, struct address_space
+                        *mapping, loff_t pos, size_t count);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;  /* Global so it can be exported to sysctl
diff -puN mm/filemap.c~aio-O_SYNC-fix mm/filemap.c
--- 25/mm/filemap.c~aio-O_SYNC-fix	2003-09-14 23:09:14.000000000 -0700
+++ 25-akpm/mm/filemap.c	2003-09-14 23:10:10.000000000 -0700
@@ -889,19 +889,18 @@ out:
         return retval;
 }
 
-ssize_t
-generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+ssize_t generic_file_aio_read(struct kiocb *iocb, char __user *buf,
+                        size_t count, loff_t pos)
 {
         struct iovec local_iov = { .iov_base = buf, .iov_len = count };
 
-        BUG_ON(iocb->ki_pos != pos);
         return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
 }
 EXPORT_SYMBOL(generic_file_aio_read);
 EXPORT_SYMBOL(__generic_file_aio_read);
 
-ssize_t
-generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+ssize_t generic_file_read(struct file *filp, char __user *buf,
+                        size_t count, loff_t *ppos)
 {
         struct iovec local_iov = { .iov_base = buf, .iov_len = count };
         struct kiocb kiocb;
@@ -914,7 +913,8 @@ generic_file_read(struct file *filp, cha
         return ret;
 }
 
-int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+int file_send_actor(read_descriptor_t * desc, struct page *page,
+                        unsigned long offset, unsigned long size)
 {
         ssize_t written;
         unsigned long count = desc->count;
@@ -1725,7 +1725,7 @@ EXPORT_SYMBOL(generic_write_checks);
  *	okir@monad.swb.de
  */
 ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
 {
         struct file *file = iocb->ki_filp;
@@ -1917,6 +1917,59 @@ out:
 }
 
 ssize_t
+generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t *ppos)
+{
+        struct file *file = iocb->ki_filp;
+        struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
+        struct inode *inode = mapping->host;
+        ssize_t ret;
+        loff_t pos = *ppos;
+
+        if (!iov->iov_base && !is_sync_kiocb(iocb)) {
+                /* nothing to transfer, may just need to sync data */
+                ret = iov->iov_len; /* vector AIO not supported yet */
+                goto osync;
+        }
+
+        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+
+        /*
+         * Avoid doing a sync in parts for aio - its more efficient to
+         * call in again after all the data has been copied
+         */
+        if (!is_sync_kiocb(iocb))
+                return ret;
+
+osync:
+        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+                ssize_t err;
+
+                err = sync_page_range_nolock(inode, mapping, pos, ret);
+                if (err < 0)
+                        ret = err;
+                else
+                        *ppos = pos + err;
+        }
+        return ret;
+}
+
+
+ssize_t
+__generic_file_write_nolock(struct file *file, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t *ppos)
+{
+        struct kiocb kiocb;
+        ssize_t ret;
+
+        init_sync_kiocb(&kiocb, file);
+        ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+        if (-EIOCBQUEUED == ret)
+                ret = wait_on_sync_kiocb(&kiocb);
+        return ret;
+}
+
+ssize_t
 generic_file_write_nolock(struct file *file, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
 {
@@ -1940,19 +1993,33 @@ ssize_t generic_file_aio_write(struct ki
         struct iovec local_iov = { .iov_base = (void __user *)buf,
                                         .iov_len = count };
 
-        BUG_ON(iocb->ki_pos != pos);
+        if (!buf && !is_sync_kiocb(iocb)) {
+                /* nothing to transfer, may just need to sync data */
+                ret = count;
+                goto osync;
+        }
 
         down(&inode->i_sem);
-        ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+        ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
                                                 &iocb->ki_pos);
         up(&inode->i_sem);
 
+        /*
+         * Avoid doing a sync in parts for aio - its more efficient to
+         * call in again after all the data has been copied
+         */
+        if (!is_sync_kiocb(iocb))
+                return ret;
+
+osync:
         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
                 ssize_t err;
 
                 err = sync_page_range(inode, mapping, pos, ret);
                 if (err < 0)
                         ret = err;
+                else
+                        iocb->ki_pos = pos + err;
         }
         return ret;
 }
@@ -1969,7 +2036,7 @@ ssize_t generic_file_write(struct file *
                                         .iov_len = count };
 
         down(&inode->i_sem);
-        ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
+        ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
         up(&inode->i_sem);
 
         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2002,7 +2069,7 @@ ssize_t generic_file_writev(struct file 
         ssize_t ret;
 
         down(&inode->i_sem);
-        ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
+        ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
         up(&inode->i_sem);
 
         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
diff -puN mm/page-writeback.c~aio-O_SYNC-fix mm/page-writeback.c
--- 25/mm/page-writeback.c~aio-O_SYNC-fix	2003-09-14 23:09:14.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-09-14 23:09:39.000000000 -0700
@@ -678,3 +678,27 @@ ssize_t sync_page_range(struct inode *in
                 ret = wait_on_page_range(mapping, pos, count);
         return ret;
 }
+
+/*
+ * It is really better to use sync_page_range, rather than call
+ * sync_page_range_nolock while holding i_sem, if you don't
+ * want to block parallel O_SYNC writes until the pages in this
+ * range are written out.
+ */
+ssize_t sync_page_range_nolock(struct inode *inode, struct address_space
+                        *mapping, loff_t pos, size_t count)
+{
+        int ret;
+
+        if (!mapping->a_ops->writepage)
+                return 0;
+        if (mapping->backing_dev_info->memory_backed)
+                return 0;
+        ret = write_out_page_range(mapping, pos, count);
+        if (ret >= 0) {
+                ret = generic_osync_inode(inode, OSYNC_METADATA);
+        }
+        if (ret >= 0)
+                ret = wait_on_page_range(mapping, pos, count);
+        return ret;
+}
diff -puN fs/aio.c~aio-O_SYNC-fix fs/aio.c
--- 25/fs/aio.c~aio-O_SYNC-fix	2003-09-14 23:09:20.000000000 -0700
+++ 25-akpm/fs/aio.c	2003-09-14 23:09:20.000000000 -0700
@@ -1289,11 +1289,6 @@ static ssize_t aio_pwrite(struct kiocb *
         struct inode *inode = mapping->host;
         ssize_t ret = 0;
 
-        if (!iocb->ki_buf) {
-                ret = iocb->ki_left;
-                goto retry_osync;
-        }
-
         ret = file->f_op->aio_write(iocb, iocb->ki_buf,
                 iocb->ki_left, iocb->ki_pos);
 
@@ -1302,7 +1297,7 @@ static ssize_t aio_pwrite(struct kiocb *
          * for a balance_dirty_pages to complete
          */
         if (ret > 0) {
-                iocb->ki_buf += ret;
+                iocb->ki_buf += iocb->ki_buf ? ret : 0;
                 iocb->ki_left -= ret;
 
                 ret = -EIOCBRETRY;
@@ -1312,30 +1307,19 @@ static ssize_t aio_pwrite(struct kiocb *
         /* No need to retry anymore unless we need to osync data */
         if (ret == 0) {
                 ret = iocb->ki_nbytes - iocb->ki_left;
-                /* Set things up for potential O_SYNC */
-                iocb->ki_buf = NULL;
-                iocb->ki_pos -= ret;    /* back up fpos */
-                iocb->ki_left = ret;    /* sync only what we have written out */
-                iocb->ki_nbytes = ret;
-        }
+                if (!iocb->ki_buf)
+                        return ret;
 
-
-retry_osync:
-        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-                ssize_t err;
-
-                err = sync_page_range(inode, mapping, iocb->ki_pos, ret);
-                if (err <= 0) {
-                        ret = err;
-                } else {
-                        BUG_ON(err > iocb->ki_left);
-                        iocb->ki_pos += err;
-                        iocb->ki_left -= err;
+                /* Set things up for potential O_SYNC */
+                if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+                        iocb->ki_buf = NULL;
+                        iocb->ki_pos -= ret;    /* back up fpos */
+                        iocb->ki_left = ret;    /* sync what we have written out */
+                        iocb->ki_nbytes = ret;
                         ret = -EIOCBRETRY;
                 }
         }
-        if (ret == 0)
-                ret = iocb->ki_nbytes - iocb->ki_left;
+
         return ret;
 }
 
_
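
For readers tracing the retry logic above, the following is a simplified,
user-space sketch of the scheme the patched aio_pwrite implements: data is
copied over one or more -EIOCBRETRY passes, and only once everything has
been written is the iocb re-aimed (buffer pointer set to NULL, file position
backed up) at a single O_SYNC pass over the written range.  This is
illustrative only - it is not kernel code, and every name in it (fake_iocb,
fake_aio_pwrite, and so on) is invented for the example.

#include <stdio.h>

#define EIOCBRETRY 1  /* stand-in for the kernel's -EIOCBRETRY */

/* Minimal stand-in for the bits of struct kiocb the retry loop uses. */
struct fake_iocb {
        const char *buf;    /* NULL marks the O_SYNC pass, as in the patch */
        long        left;   /* bytes (or sync range) still outstanding     */
        long        nbytes; /* total size of the current request           */
        long        pos;    /* file position                               */
        int         o_sync; /* O_SYNC or IS_SYNC(inode)                    */
};

/* Pretend "write some data" step: copies at most 4 bytes per call. */
static long fake_partial_write(struct fake_iocb *io)
{
        long chunk = io->left < 4 ? io->left : 4;

        printf("write %ld bytes at pos %ld\n", chunk, io->pos);
        io->pos += chunk;
        return chunk;
}

/* Pretend sync_page_range(): syncs the whole range in one go here. */
static long fake_sync_range(long pos, long count)
{
        printf("O_SYNC: flush bytes [%ld, %ld)\n", pos, pos + count);
        return count;
}

/*
 * Simplified analogue of the patched aio_pwrite(): returns -EIOCBRETRY
 * while more work remains, or the byte count when the request is done.
 */
static long fake_aio_pwrite(struct fake_iocb *io)
{
        long ret;

        if (io->buf)
                ret = fake_partial_write(io);           /* one data pass  */
        else
                ret = fake_sync_range(io->pos, io->left); /* sync pass    */

        if (ret > 0) {
                if (io->buf)            /* mirrors ki_buf += ki_buf ? ret : 0 */
                        io->buf += ret;
                io->left -= ret;
                if (io->left > 0)
                        return -EIOCBRETRY;             /* come back for more */
        }

        ret = io->nbytes - io->left;
        if (!io->buf)
                return ret;                             /* sync pass finished */

        /* All data copied: set up one O_SYNC pass over what was written. */
        if (ret > 0 && io->o_sync) {
                io->buf    = NULL;
                io->pos   -= ret;                       /* back up fpos */
                io->left   = ret;
                io->nbytes = ret;
                return -EIOCBRETRY;
        }
        return ret;
}

int main(void)
{
        struct fake_iocb io = { "hello world", 11, 11, 100, 1 };
        long ret;

        /* The AIO core would requeue the iocb on every -EIOCBRETRY. */
        while ((ret = fake_aio_pwrite(&io)) == -EIOCBRETRY)
                ;
        printf("request completed, %ld bytes\n", ret);
        return 0;
}

Running the sketch shows three write passes (4, 4 and 3 bytes) followed by a
single sync pass over bytes [100, 111), which is the ordering the patch is
after: no sync_page_range call on every retry, only one once the data is in.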