From: Suparna Bhattacharya The current implementation of O_SYNC support for AIO calls sync_page_range explicitly in the high level aio write handlers after invoking the fops aio_write method. This a duplication of logic across sync i/o and aio (not good from a maintenance perspective, and more error prone, e.g. forgets to check for O_DIRECT). More importantly, it doesn't take into account the possibility that different filesystems may have their own O_SYNC implementations in their corresponding aio methods. This is a problem for example in the case of ext3 which may commit the journal after data and meta-data are written out. This patch fixes the above problems by moving the call to sync_page_range out of the high level aio_pwrite handler, and back into generic_file_aio_write for both aio and sync i/o. It takes care to make sure that sync_page_range is called only after all the data is copied in the AIO case, rather than on every retry. fs/aio.c | 36 ++++++++++-------------------------- mm/filemap.c | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) diff -puN fs/aio.c~aio-O_SYNC-fix fs/aio.c --- 25/fs/aio.c~aio-O_SYNC-fix 2003-08-02 12:48:04.000000000 -0700 +++ 25-akpm/fs/aio.c 2003-08-02 12:48:04.000000000 -0700 @@ -1288,11 +1288,6 @@ static ssize_t aio_pwrite(struct kiocb * struct inode *inode = mapping->host; ssize_t ret = 0; - if (!iocb->ki_buf) { - ret = iocb->ki_left; - goto retry_osync; - } - ret = file->f_op->aio_write(iocb, iocb->ki_buf, iocb->ki_left, iocb->ki_pos); @@ -1301,7 +1296,7 @@ static ssize_t aio_pwrite(struct kiocb * * for a balance_dirty_pages to complete */ if (ret > 0) { - iocb->ki_buf += ret; + iocb->ki_buf += iocb->ki_buf ? ret : 0; iocb->ki_left -= ret; ret = -EIOCBRETRY; @@ -1311,30 +1306,19 @@ static ssize_t aio_pwrite(struct kiocb * /* No need to retry anymore unless we need to osync data */ if (ret == 0) { ret = iocb->ki_nbytes - iocb->ki_left; - /* Set things up for potential O_SYNC */ - iocb->ki_buf = NULL; - iocb->ki_pos -= ret; /* back up fpos */ - iocb->ki_left = ret; /* sync only what we have written out */ - iocb->ki_nbytes = ret; - } + if (!iocb->ki_buf) + return ret; - -retry_osync: - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range(inode, mapping, iocb->ki_pos, ret); - if (err <= 0) { - ret = err; - } else { - BUG_ON(err > iocb->ki_left); - iocb->ki_pos += err; - iocb->ki_left -= err; + /* Set things up for potential O_SYNC */ + if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + iocb->ki_buf = NULL; + iocb->ki_pos -= ret; /* back up fpos */ + iocb->ki_left = ret; /* sync what we have written out */ + iocb->ki_nbytes = ret; ret = -EIOCBRETRY; } } - if (ret == 0) - ret = iocb->ki_nbytes - iocb->ki_left; + return ret; } diff -puN mm/filemap.c~aio-O_SYNC-fix mm/filemap.c --- 25/mm/filemap.c~aio-O_SYNC-fix 2003-08-02 12:48:04.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-08-02 12:48:04.000000000 -0700 @@ -1919,17 +1919,33 @@ ssize_t generic_file_aio_write(struct ki BUG_ON(iocb->ki_pos != pos); + if (!buf && !is_sync_kiocb(iocb)) { + /* nothing to transfer, may just need to sync data */ + ret = count; + goto osync; + } + down(&inode->i_sem); ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); up(&inode->i_sem); + /* + * Avoid doing a sync in parts for aio - its more efficient to + * call in again after all the data has been copied + */ + if (!is_sync_kiocb(iocb)) + return ret; + +osync: if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; err = sync_page_range(inode, mapping, pos, ret); if (err < 0) ret = err; + else + iocb->ki_pos = pos + err; } return ret; } _