/*
 * O_SYNC speedup patch, header changes (535-lazy-readahead-adapt ->
 * 540-O_SYNC-speedup).
 *
 * Moves the OSYNC_METADATA / OSYNC_DATA / OSYNC_INODE flag definitions and
 * the generic_osync_inode() prototype out of include/linux/buffer_head.h
 * and into include/linux/fs.h.  The moved lines themselves are unchanged.
 *
 * NOTE(review): the line breaks of these two file diffs had been collapsed;
 * they are restored here from the hunk headers' line counts.  Whitespace
 * inside each line is left as found (tabs may have been flattened to single
 * spaces), so this needs "patch -l" to apply.
 */
diff -purN -X /home/mbligh/.diff.exclude 535-lazy-readahead-adapt/include/linux/buffer_head.h 540-O_SYNC-speedup/include/linux/buffer_head.h
--- 535-lazy-readahead-adapt/include/linux/buffer_head.h 2004-02-04 16:24:32.000000000 -0800
+++ 540-O_SYNC-speedup/include/linux/buffer_head.h 2004-02-20 15:40:58.000000000 -0800
@@ -206,12 +206,6 @@ int nobh_prepare_write(struct page*, uns
 int nobh_commit_write(struct file *, struct page *, unsigned, unsigned);
 int nobh_truncate_page(struct address_space *, loff_t);
 
-#define OSYNC_METADATA (1<<0)
-#define OSYNC_DATA (1<<1)
-#define OSYNC_INODE (1<<2)
-int generic_osync_inode(struct inode *, struct address_space *, int);
-
-
 /*
  * inline definitions
  */
diff -purN -X /home/mbligh/.diff.exclude 535-lazy-readahead-adapt/include/linux/fs.h 540-O_SYNC-speedup/include/linux/fs.h
--- 535-lazy-readahead-adapt/include/linux/fs.h 2004-02-20 15:40:56.000000000 -0800
+++ 540-O_SYNC-speedup/include/linux/fs.h 2004-02-20 15:40:58.000000000 -0800
@@ -764,6 +764,11 @@ extern int vfs_rename(struct inode *, st
 #define DT_SOCK 12
 #define DT_WHT 14
 
+#define OSYNC_METADATA (1<<0)
+#define OSYNC_DATA (1<<1)
+#define OSYNC_INODE (1<<2)
+int generic_osync_inode(struct inode *, struct address_space *, int);
+
 /*
  * This is the "filldir" function type, used by readdir() to let
  * the kernel specify what kind of dirent layout it wants to have.
/*
 * O_SYNC write speedup: declaration in include/linux/writeback.h, callers
 * in mm/filemap.c, implementation in mm/page-writeback.c.
 *
 * What the hunks below do:
 *  - generic_file_aio_write_nolock() now calls generic_osync_inode() for an
 *    O_SYNC/IS_SYNC write only when the mapping has no ->writepage or the
 *    kiocb is not synchronous.
 *  - generic_file_aio_write(), generic_file_write() and
 *    generic_file_writev() call the new sync_page_range() on the bytes just
 *    written, after i_sem has been dropped.
 *  - sync_page_range() starts writeout of the range, calls
 *    generic_osync_inode(OSYNC_METADATA) under i_sem, then waits on the
 *    range.
 *
 * Changes made in review, relative to the text as found:
 *  - operate_on_page_range() assigned the result of every page to "ret", so
 *    an error on any page but the last was overwritten by a later success
 *    and never reached the caller.  It now keeps the first error.  "ret" is
 *    also an int, matching the function's return type.  The new-side line
 *    count in the last hunk header is adjusted for the added lines.
 *  - The added #include in mm/page-writeback.c had lost its target; it is
 *    restored to <linux/pagevec.h>, which declares the pagevec helpers the
 *    new code uses.  The three context #include lines lost theirs too and
 *    are left as found, so that hunk needs its context repaired before it
 *    can apply.
 *  - Line breaks and indentation are reconstructed from the hunk line
 *    counts; whitespace inside each line is as found, so "patch -l" is
 *    needed at the least.
 *
 * NOTE(review): with the new test in generic_file_aio_write_nolock(), a
 * caller that reaches it directly with a synchronous kiocb on a mapping that
 * has ->writepage gets no O_SYNC flush from this path - confirm each such
 * caller syncs for itself.
 */
diff -purN -X /home/mbligh/.diff.exclude 535-lazy-readahead-adapt/include/linux/writeback.h 540-O_SYNC-speedup/include/linux/writeback.h
--- 535-lazy-readahead-adapt/include/linux/writeback.h 2003-10-01 11:48:26.000000000 -0700
+++ 540-O_SYNC-speedup/include/linux/writeback.h 2004-02-20 15:40:58.000000000 -0800
@@ -87,6 +87,8 @@ void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+			loff_t pos, size_t count);
 
 /* pdflush.c */
 extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
diff -purN -X /home/mbligh/.diff.exclude 535-lazy-readahead-adapt/mm/filemap.c 540-O_SYNC-speedup/mm/filemap.c
--- 535-lazy-readahead-adapt/mm/filemap.c 2004-02-20 15:40:53.000000000 -0800
+++ 540-O_SYNC-speedup/mm/filemap.c 2004-02-20 15:40:58.000000000 -0800
@@ -1876,7 +1876,6 @@ generic_file_aio_write_nolock(struct kio
 	if (err)
 		goto out;
 
-
 	if (count == 0)
 		goto out;
 
@@ -1989,11 +1988,13 @@ generic_file_aio_write_nolock(struct kio
 	/*
 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
 	 */
-	if (status >= 0) {
-		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
-			status = generic_osync_inode(inode, mapping,
-					OSYNC_METADATA|OSYNC_DATA);
-	}
+	if (likely(status >= 0)) {
+		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			if (!a_ops->writepage || !is_sync_kiocb(iocb))
+				status = generic_osync_inode(inode, mapping,
+					OSYNC_METADATA|OSYNC_DATA);
+		}
+	}
 
 out_status:
 	err = written ? written : status;
@@ -2025,36 +2026,52 @@ ssize_t generic_file_aio_write(struct ki
 			       size_t count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	ssize_t err;
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+					.iov_len = count };
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	down(&inode->i_sem);
-	err = generic_file_aio_write_nolock(iocb, &local_iov, 1,
+	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
 						&iocb->ki_pos);
 	up(&inode->i_sem);
 
-	return err;
-}
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
+		err = sync_page_range(inode, mapping, pos, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
 
 EXPORT_SYMBOL(generic_file_aio_write);
 
 ssize_t generic_file_write(struct file *file, const char __user *buf,
 			   size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_mapping->host;
-	ssize_t err;
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+					.iov_len = count };
 
 	down(&inode->i_sem);
-	err = generic_file_write_nolock(file, &local_iov, 1, ppos);
+	ret = generic_file_write_nolock(file, &local_iov, 1, ppos);
 	up(&inode->i_sem);
 
-	return err;
-}
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		ssize_t err;
+		err = sync_page_range(inode, mapping, *ppos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
 
 EXPORT_SYMBOL(generic_file_write);
 
 ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
@@ -2073,14 +2090,23 @@ ssize_t generic_file_readv(struct file *
 EXPORT_SYMBOL(generic_file_readv);
 
 ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
-			unsigned long nr_segs, loff_t * ppos)
+			unsigned long nr_segs, loff_t *ppos)
 {
-	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	ssize_t ret;
 
 	down(&inode->i_sem);
 	ret = generic_file_write_nolock(file, iov, nr_segs, ppos);
 	up(&inode->i_sem);
+
+	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		int err;
+
+		err = sync_page_range(inode, mapping, *ppos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
 	return ret;
 }
 
diff -purN -X /home/mbligh/.diff.exclude 535-lazy-readahead-adapt/mm/page-writeback.c 540-O_SYNC-speedup/mm/page-writeback.c
--- 535-lazy-readahead-adapt/mm/page-writeback.c 2004-02-04 16:24:35.000000000 -0800
+++ 540-O_SYNC-speedup/mm/page-writeback.c 2004-02-20 15:40:58.000000000 -0800
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include <linux/pagevec.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -568,3 +569,119 @@ int test_clear_page_dirty(struct page *p
 	return 0;
 }
 EXPORT_SYMBOL(test_clear_page_dirty);
+
+
+/*
+ * Look up page cache pages of @mapping starting at the page that holds @pos
+ * and call @operator on each one, stopping once the page that holds
+ * pos + count - 1 has been passed.  @count must be nonzero.
+ *
+ * Each page is locked before @operator is called and is not unlocked here
+ * afterwards.  Pages that turn out to be truncated (no ->mapping) are
+ * unlocked and skipped.
+ *
+ * Every page is visited even after a failure.  The first failure seen - a
+ * nonzero return from @operator, or -EIO if PageError() is set on the page
+ * afterwards - is what gets returned, so a later clean page cannot mask it.
+ */
+static int operate_on_page_range(struct address_space *mapping,
+		loff_t pos, size_t count, int (*operator)(struct page *))
+{
+	pgoff_t first = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT; /* inclusive */
+	pgoff_t next = first;
+	struct pagevec pvec;
+	int ret = 0;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while (pagevec_lookup(&pvec, mapping, next,
+			min((pgoff_t)PAGEVEC_SIZE, last - next + 1))) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			int err;
+
+			lock_page(page); /* stabilise ->index */
+			if (!page->mapping) { /* truncated */
+				unlock_page(page);
+				next++;
+				continue;
+			}
+			next = page->index + 1;
+			err = (*operator)(page);
+			if (!err && PageError(page))
+				err = -EIO;
+			if (err && !ret) /* keep the first error */
+				ret = err;
+			if (next > last)
+				break;
+		}
+		pagevec_release(&pvec);
+		if (next > last)
+			break;
+	}
+	return ret;
+}
+
+/* Unlock the page, then wait for any writeback on it to finish. */
+static int page_waiter(struct page *page)
+{
+	unlock_page(page);
+	wait_on_page_writeback(page);
+	return 0;
+}
+
+static int
+wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count)
+{
+	return operate_on_page_range(mapping, pos, count, page_waiter);
+}
+
+/*
+ * Wait for writeback already in flight on the page, then hand it to
+ * ->writepage as a one-page WB_SYNC_ALL request.
+ */
+static int page_writer(struct page *page)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 1,
+	};
+
+	wait_on_page_writeback(page);
+	return page->mapping->a_ops->writepage(page, &wbc);
+}
+
+static int
+write_out_page_range(struct address_space *mapping, loff_t pos, size_t count)
+{
+	return operate_on_page_range(mapping, pos, count, page_writer);
+}
+
+/*
+ * Write and wait upon all the pages in the passed range. This is a "data
+ * integrity" operation. It waits upon in-flight writeout before starting and
+ * waiting upon new writeout. If there was an IO error, return it.
+ *
+ * We need to re-take i_sem during the generic_osync_inode list walk because
+ * it is otherwise livelockable.
+ */
+int sync_page_range(struct inode *inode, struct address_space *mapping,
+			loff_t pos, size_t count)
+{
+	int ret;
+
+	if (!mapping->a_ops->writepage)
+		return 0;
+	if (mapping->backing_dev_info->memory_backed)
+		return 0;
+	ret = write_out_page_range(mapping, pos, count);
+	if (ret == 0) {
+		down(&inode->i_sem);
+		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
+		up(&inode->i_sem);
+	}
+	if (ret == 0)
+		ret = wait_on_page_range(mapping, pos, count);
+	return ret;
+}