We hold i_sem during the various sync() operations to prevent livelocks: if another thread is dirtying the file, a sync() may never return. Or at least, that used to be true when we were using the per-address_space page lists. Since writeback has used radix tree traversal it is not possible to livelock the sync() operations, because they only visit each page a single time. sync_page_range() (used by O_SYNC writes) has not been holding i_sem for quite some time, for the above reasons. The patch converts fsync(), fdatasync() and msync() to also not hold i_sem during the radix-tree-based writeback. Now, we _do_ still need to hold i_sem across the file->f_op->fsync() call, because that is still based on a list_head walk, and is still livelockable. But in the case of msync() I deliberately left i_sem untaken. This is because we're currently deadlockable in msync, because mmap_sem is already held, and mmap_sem nexts inside i_sem, due to direct-io.c. And yes, the ranking of down_read() veruss down() does matter: Task A Task B Task C down_read(rwsem) down(sem) down_write(rwsem) down(sem) down_read(rwsem) C's down_write() will cause B's down_read to block. B holds `sem', so A will never release `rwsem'. So the patch fixes a hard-to-hit triple-task deadlock, but adds a possible livelock in msync(). It is possible to fix sys_msync() so that it takes i_sem outside i_mmap_sem. Later. Signed-off-by: Andrew Morton --- 25-akpm/fs/buffer.c | 14 +++++++++----- 25-akpm/mm/msync.c | 6 ++++-- mm/filemap.c | 0 3 files changed, 13 insertions(+), 7 deletions(-) diff -puN mm/msync.c~file-sync-no-i_sem mm/msync.c --- 25/mm/msync.c~file-sync-no-i_sem 2004-11-24 19:19:33.252879016 -0800 +++ 25-akpm/mm/msync.c 2004-11-24 19:29:15.311392672 -0800 @@ -186,9 +186,12 @@ static int msync_interval(struct vm_area struct address_space *mapping = file->f_mapping; int err; - down(&mapping->host->i_sem); ret = filemap_fdatawrite(mapping); if (file->f_op && file->f_op->fsync) { + /* + * We don't take i_sem here because mmap_sem + * is already held. + */ err = file->f_op->fsync(file,file->f_dentry,1); if (err && !ret) ret = err; @@ -196,7 +199,6 @@ static int msync_interval(struct vm_area err = filemap_fdatawait(mapping); if (!ret) ret = err; - up(&mapping->host->i_sem); } } return ret; diff -puN fs/buffer.c~file-sync-no-i_sem fs/buffer.c --- 25/fs/buffer.c~file-sync-no-i_sem 2004-11-24 19:19:33.268876584 -0800 +++ 25-akpm/fs/buffer.c 2004-11-24 19:22:48.176246160 -0800 @@ -347,18 +347,22 @@ asmlinkage long sys_fsync(unsigned int f goto out_putf; } - /* We need to protect against concurrent writers.. */ - down(&mapping->host->i_sem); current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(mapping); + + /* + * We need to protect against concurrent writers, + * which could cause livelocks in fsync_buffers_list + */ + down(&mapping->host->i_sem); err = file->f_op->fsync(file, file->f_dentry, 0); if (!ret) ret = err; + up(&mapping->host->i_sem); err = filemap_fdatawait(mapping); if (!ret) ret = err; current->flags &= ~PF_SYNCWRITE; - up(&mapping->host->i_sem); out_putf: fput(file); @@ -383,17 +387,17 @@ asmlinkage long sys_fdatasync(unsigned i mapping = file->f_mapping; - down(&mapping->host->i_sem); current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(mapping); + down(&mapping->host->i_sem); err = file->f_op->fsync(file, file->f_dentry, 1); if (!ret) ret = err; + up(&mapping->host->i_sem); err = filemap_fdatawait(mapping); if (!ret) ret = err; current->flags &= ~PF_SYNCWRITE; - up(&mapping->host->i_sem); out_putf: fput(file); diff -puN mm/filemap.c~file-sync-no-i_sem mm/filemap.c _