In databases it is common to have multiple threads or processes performing O_SYNC writes against different parts of the same file. Our performance at this is poor, because each writer blocks access to the file by waiting on I/O completion while holding i_sem: everything is serialised. The patch improves things by moving the waiting outside i_sem. So other threads can get in and submit their I/O and permit the disk scheduler to optimise the IO patterns better. Also, the O_SYNC writer only writes and waits on the pages which it wrote, rather than writing and waiting on all dirty pages in the file. The reason we haven't been able to do this before is that the required walk of the address_space page lists is easily livelockable without the i_sem serialisation. But in this patch we perform the waiting via a radix-tree walk of the affected pages. This cannot be livelocked. The sync of the inode's metadata is still performed inside i_sem. This is because it is list-based and is hence still livelockable. However it is usually the case that databases are overwriting existing file blocks and there will be no dirty buffers attached to the address_space anyway. And if there _are_ dirty buffers, we have already scheduled the IO for the pages, so the elevator can merge that IO against the metadata. (At present there are two separate write-and-wait cycles). 
mm/filemap.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 files changed, 88 insertions(+), 8 deletions(-) diff -puN mm/filemap.c~O_SYNC-speedup mm/filemap.c --- 25/mm/filemap.c~O_SYNC-speedup 2003-05-28 04:32:28.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-05-28 04:33:01.000000000 -0700 @@ -1687,6 +1687,7 @@ generic_file_aio_write_nolock(struct kio size_t iov_base = 0; /* offset in the current iovec */ unsigned long seg; char __user *buf; + int (*writepage)(struct page *page, struct writeback_control *wbc); ocount = 0; for (seg = 0; seg < nr_segs; seg++) { @@ -1720,7 +1721,6 @@ generic_file_aio_write_nolock(struct kio if (err) goto out; - if (count == 0) goto out; @@ -1753,6 +1753,10 @@ generic_file_aio_write_nolock(struct kio goto out_status; } + writepage = NULL; + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) + writepage = a_ops->writepage; /* This may be NULL */ + buf = iov->iov_base; do { unsigned long index; @@ -1825,7 +1829,19 @@ generic_file_aio_write_nolock(struct kio if (!PageReferenced(page)) SetPageReferenced(page); - unlock_page(page); + if (unlikely(writepage != NULL)) { + if (status >= 0) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + wait_on_page_writeback(page); + status = (*writepage)(page, &wbc); + } + } else { + unlock_page(page); + } page_cache_release(page); if (status < 0) break; @@ -1844,10 +1860,16 @@ generic_file_aio_write_nolock(struct kio /* * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC */ - if (status >= 0) { - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) - status = generic_osync_inode(inode, - OSYNC_METADATA|OSYNC_DATA); + if (likely(status >= 0)) { + if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + int flags; + + if (writepage) + flags = OSYNC_METADATA; + else + flags = OSYNC_METADATA|OSYNC_DATA; + status = generic_osync_inode(inode, flags); + } } out_status: @@ -1878,7 +1900,8 @@ ssize_t 
generic_file_aio_write(struct ki struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode->i_mapping->host; ssize_t err; - struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; BUG_ON(iocb->ki_pos != pos); @@ -1892,17 +1915,74 @@ ssize_t generic_file_aio_write(struct ki EXPORT_SYMBOL(generic_file_aio_write); EXPORT_SYMBOL(generic_file_aio_write_nolock); +/* + * This is for O_SYNC: wait for writeout to complete against all the pagecache + * pages in the passed range. If there was an IO error, return it. + */ +static ssize_t +wait_on_page_range(struct address_space *mapping, loff_t pos, size_t count) +{ + pgoff_t first = pos >> PAGE_CACHE_SHIFT; + pgoff_t last = (pos + count - 1) >> PAGE_CACHE_SHIFT; /* inclusive */ + pgoff_t next = first; + struct pagevec pvec; + ssize_t ret = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, + min((pgoff_t)PAGEVEC_SIZE, last - next + 1))) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); /* stabilise ->index */ + if (!page->mapping) { /* truncated */ + unlock_page(page); + next++; + continue; + } + next = page->index + 1; + unlock_page(page); + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + if (next > last) + break; + } + pagevec_release(&pvec); + if (next > last) + break; + } + return ret; +} + ssize_t generic_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct inode *inode = file->f_dentry->d_inode->i_mapping->host; ssize_t err; - struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; down(&inode->i_sem); err = generic_file_write_nolock(file, &local_iov, 1, ppos); up(&inode->i_sem); + /* + * Now we've dropped i_sem, we can wait on any O_SYNC 
writeout + */ + if (err <= 0) + goto out; + if (!inode->i_mapping->a_ops->writepage) + goto out; + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + ssize_t err2; + + err2 = wait_on_page_range(inode->i_mapping, *ppos - err, err); + if (err2 < 0) + err = err2; + } +out: return err; } _