In databases it is common to have multiple threads or processes performing O_SYNC writes against different parts of the same file. Our performance at this is poor, because each writer blocks access to the file by waiting on I/O completion while holding i_sem: everything is serialised. The patch improves things by moving the waiting outside i_sem. So other threads can get in and submit their I/O and permit the disk scheduler to optimise the IO patterns better. Also, the O_SYNC writer only writes and waits on the pages which it wrote, rather than writing and waiting on all dirty pages in the file. The reason we haven't been able to do this before is that the required walk of the address_space page lists is easily livelockable without the i_sem serialisation. But in this patch we perform the waiting via a radix-tree walk of the affected pages. This cannot be livelocked. The sync of the inode's metadata is still performed inside i_sem. This is because it is list-based and is hence still livelockable. However it is usually the case that databases are overwriting existing file blocks and there will be no dirty buffers attached to the address_space anyway. And if there _are_ dirty buffers, we have already scheduled the IO for the pages, so the elevator can merge that IO against the metadata. (At present there are two separate write-and-wait cycles). 
mm/filemap.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 files changed, 88 insertions(+), 8 deletions(-) diff -puN mm/filemap.c~O_SYNC-speedup mm/filemap.c --- 25/mm/filemap.c~O_SYNC-speedup 2003-05-28 04:32:28.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-05-28 04:33:01.000000000 -0700 @@ -1687,6 +1687,7 @@ generic_file_aio_write_nolock(struct kio size_t iov_base = 0; /* offset in the current iovec */ unsigned long seg; char __user *buf; + int (*writepage)(struct page *page, struct writeback_control *wbc); ocount = 0; for (seg = 0; seg < nr_segs; seg++) { @@ -1720,7 +1721,6 @@ generic_file_aio_write_nolock(struct kio if (err) goto out; - if (count == 0) goto out; @@ -1753,6 +1753,10 @@ generic_file_aio_write_nolock(struct kio goto out_status; } + writepage = NULL; + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) + writepage = a_ops->writepage; /* This may be NULL */ + buf = iov->iov_base; do { unsigned long index; @@ -1825,7 +1829,19 @@ generic_file_aio_write_nolock(struct kio if (!PageReferenced(page)) SetPageReferenced(page); - unlock_page(page); + if (unlikely(writepage != NULL)) { + if (status >= 0) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + wait_on_page_writeback(page); + status = (*writepage)(page, &wbc); + } + } else { + unlock_page(page); + } page_cache_release(page); if (status < 0) break; @@ -1844,10 +1860,16 @@ generic_file_aio_write_nolock(struct kio /* * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC */ - if (status >= 0) { - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) - status = generic_osync_inode(inode, - OSYNC_METADATA|OSYNC_DATA); + if (likely(status >= 0)) { + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int flags; + + if (writepage) + flags = OSYNC_METADATA; + else + flags = OSYNC_METADATA|OSYNC_DATA; + status = generic_osync_inode(inode, flags); + } } out_status: @@ -1878,7 +1900,8 @@ ssize_t 
generic_file_aio_write(struct ki struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode->i_mapping->host; ssize_t err; - struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; BUG_ON(iocb->ki_pos != pos); @@ -1892,17 +1915,74 @@ ssize_t generic_file_aio_write(struct ki EXPORT_SYMBOL(generic_file_aio_write); EXPORT_SYMBOL(generic_file_aio_write_nolock); +/* + * This is for O_SYNC: wait for writeout to complete against all the pagecache + * pages in the passed range. If there was an IO error, return it. + */ +static ssize_t +wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count) +{ + pgoff_t first = pos >> PAGE_CACHE_SHIFT; + pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT; /* inclusive */ + pgoff_t next = first; + struct pagevec pvec; + ssize_t ret = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, + min((pgoff_t)PAGEVEC_SIZE, last - next + 1))) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); /* stabilise ->index */ + if (!page->mapping) { /* truncated */ + unlock_page(page); + next++; + continue; + } + next = page->index + 1; + unlock_page(page); + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + if (next > last) + break; + } + pagevec_release(&pvec); + if (next > last) + break; + } + return ret; +} + ssize_t generic_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file->f_dentry->d_inode->i_mapping->host; ssize_t err; - struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; down(&inode->i_sem); err = generic_file_write_nolock(file, &local_iov, 1, ppos); up(&inode->i_sem); + /* + * Now we've dropped i_sem, we can wait on any O_SYNC 
writeout + */ + if (err <= 0) + goto out; + if (!inode->i_mapping->a_ops->writepage) + goto out; + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + ssize_t err2; + + err2 = wait_on_page_range(inode->i_mapping, *ppos - err, err); + if (err2 < 0) + err = err2; + } +out: return err; } _