From: Suparna Bhattacharya I picked up Badari's forward port of Stephen Tweedie's DIO fixes from 2.4, and have updated it with the following: - Drop i_sem after i/o submission for DIO reads/writes. - Release i_alloc_sem on DIO completion. - Modify other write paths i.e. generic_file_aio_write, generic_file_writev and xfs write to handle acquistion of i_alloc_sem and align with i_sem and i_alloc_sem being released directly by the DIO code, rather than by the caller. I had to debate a bit about how the _write_nolock() cases should be handled. The assumption I made is that for a regular file, the caller would have already acquired i_alloc_sem and i_sem. When its not a regular file the DIO code doesn't touch i_sem/i_alloc_sem anyway. Let me know how this looks to you. I'm not quite sure how to test these changes to be sure it all works. I have run aio-stress for O_DIRECT reads and writes. I still need to play with running fsx (there seem to be some problems there as I'm seeing short writes - need to investigate whether the problem exists with mm5 as such or if its due to this patch). fs/buffer.c | 32 ++++++++++++++++---------- fs/direct-io.c | 21 ++++++++++++++++- fs/inode.c | 1 fs/open.c | 2 + fs/xfs/linux/xfs_file.c | 2 + include/linux/fs.h | 2 + mm/filemap.c | 58 +++++++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 102 insertions(+), 16 deletions(-) diff -puN fs/buffer.c~O_DIRECT-race-fixes fs/buffer.c --- 25/fs/buffer.c~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/fs/buffer.c 2003-09-10 08:34:37.000000000 -0700 @@ -374,26 +374,18 @@ out: return ret; } -asmlinkage long sys_fdatasync(unsigned int fd) +int do_fdatasync(struct file * file) { - struct file * file; struct dentry * dentry; struct inode * inode; int ret, err; - ret = -EBADF; - file = fget(fd); - if (!file) - goto out; + if (unlikely(!file->f_op || !file->f_op->fsync)) + return -EINVAL; dentry = file->f_dentry; inode = dentry->d_inode; - ret = -EINVAL; - if (!file->f_op || !file->f_op->fsync) - goto out_putf; - - down(&inode->i_sem); current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(inode->i_mapping); err = file->f_op->fsync(file, dentry, 1); @@ -403,8 +395,24 @@ asmlinkage long sys_fdatasync(unsigned i if (!ret) ret = err; current->flags &= ~PF_SYNCWRITE; - up(&inode->i_sem); + return ret; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct inode * inode; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + inode = file->f_dentry->d_inode; + down(&inode->i_sem); + ret = do_fdatasync(file); + up(&inode->i_sem); out_putf: fput(file); out: diff -puN fs/direct-io.c~O_DIRECT-race-fixes fs/direct-io.c --- 25/fs/direct-io.c~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/fs/direct-io.c 2003-09-10 08:34:37.000000000 -0700 @@ -58,6 +58,7 @@ struct dio { struct bio *bio; /* bio under assembly */ struct inode *inode; int rw; + int is_reg_file; unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -205,6 +206,8 @@ static void dio_complete(struct dio *dio { if (dio->end_io) dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); + if (dio->is_reg_file) + up_read(&dio->inode->i_alloc_sem); } /* @@ -448,6 +451,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + int beyond_eof = 0; /* * If there was a memory error and we've overwritten all the @@ -465,8 +469,13 @@ static int get_more_blocks(struct dio *d if (dio_count & blkmask) fs_count++; + if (dio->is_reg_file) { + if (dio->block_in_file > dio->inode->i_size) { + beyond_eof = 1; + } + } ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, - map_bh, dio->rw == WRITE); + map_bh, (dio->rw == WRITE) && beyond_eof); } return ret; } @@ -773,6 +782,10 @@ do_holes: if (!buffer_mapped(map_bh)) { char *kaddr; + if (dio->rw == WRITE) { + return -ENOTBLK; + } + if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ @@ -863,6 +876,10 @@ direct_io_worker(int rw, struct kiocb *i dio->start_zero_done = 0; dio->block_in_file = offset >> blkbits; dio->blocks_available = 0; + if (S_ISREG(inode->i_mode)) + dio->is_reg_file = 1; + else + dio->is_reg_file = 0; dio->cur_page = NULL; @@ -949,6 +966,8 @@ direct_io_worker(int rw, struct kiocb *i * OK, all BIOs are submitted, so we can decrement bio_count to truly * reflect the number of to-be-processed BIOs. */ + if (dio->is_reg_file) + up(&dio->inode->i_sem); /* Don't need i_sem any more */ if (dio->is_async) { if (ret == 0) ret = dio->result; /* Bytes written */ diff -puN fs/inode.c~O_DIRECT-race-fixes fs/inode.c --- 25/fs/inode.c~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/fs/inode.c 2003-09-10 08:34:37.000000000 -0700 @@ -183,6 +183,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); + init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); diff -puN fs/open.c~O_DIRECT-race-fixes fs/open.c --- 25/fs/open.c~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/fs/open.c 2003-09-10 08:34:37.000000000 -0700 @@ -187,11 +187,13 @@ int do_truncate(struct dentry *dentry, l if (length < 0) return -EINVAL; + down_write(&dentry->d_inode->i_alloc_sem); newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); err = notify_change(dentry, &newattrs); up(&dentry->d_inode->i_sem); + up_write(&dentry->d_inode->i_alloc_sem); return err; } diff -puN fs/xfs/linux/xfs_file.c~O_DIRECT-race-fixes fs/xfs/linux/xfs_file.c --- 25/fs/xfs/linux/xfs_file.c~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/fs/xfs/linux/xfs_file.c 2003-09-10 08:34:37.000000000 -0700 @@ -94,6 +94,8 @@ linvfs_write( BUG_ON(iocb->ki_pos != pos); if (direct) { + down_read(&inode->i_alloc_sem); + down(&inode->i_sem); VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, NULL, error); } else { down(&inode->i_sem); diff -puN include/linux/fs.h~O_DIRECT-race-fixes include/linux/fs.h --- 25/include/linux/fs.h~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/include/linux/fs.h 2003-09-10 08:34:37.000000000 -0700 @@ -388,6 +388,7 @@ struct inode { unsigned short i_bytes; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct semaphore i_sem; + struct rw_semaphore i_alloc_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; @@ -1204,6 +1205,7 @@ static inline void invalidate_remote_ino } extern void invalidate_inode_pages2(struct address_space *mapping); extern void write_inode_now(struct inode *, int); +extern int do_fdatasync(struct file *); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); diff -puN mm/filemap.c~O_DIRECT-race-fixes mm/filemap.c --- 25/mm/filemap.c~O_DIRECT-race-fixes 2003-09-10 08:34:37.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-09-10 08:34:37.000000000 -0700 @@ -877,6 +877,10 @@ __generic_file_aio_read(struct kiocb *io retval = 0; if (!count) goto out; /* skip atime */ + if (S_ISREG(inode->i_mode)) { + down_read(&inode->i_alloc_sem); + down(&inode->i_sem); + } size = i_size_read(inode); if (pos < size) { retval = generic_file_direct_IO(READ, iocb, @@ -887,6 +891,12 @@ __generic_file_aio_read(struct kiocb *io *ppos = pos + retval; } update_atime(filp->f_dentry->d_inode); + /* + * i_sem and i_alloc_sem release are handled by + * the direct i/o code, the former is released after + * i/o submission, the latter set up to be released + * on actual i/o completion. + */ goto out; } @@ -1833,10 +1843,18 @@ __generic_file_aio_write_nolock(struct k status = generic_osync_inode(inode, OSYNC_METADATA); if (written >= 0 && !is_sync_kiocb(iocb)) written = -EIOCBQUEUED; + if (written == -ENOTBLK) { + if (S_ISREG(inode->i_mode)) { + down(&inode->i_sem); + } + goto do_buffer_io; + } goto out_status; } +do_buffer_io: buf = iov->iov_base; + written = 0; /* Reset in case this is a fallback from O_DIRECT */ do { unsigned long index; unsigned long offset; @@ -1932,6 +1950,14 @@ __generic_file_aio_write_nolock(struct k } } + if (unlikely(file->f_flags & O_DIRECT)) { + if (written) + do_fdatasync(file); + if (S_ISREG(inode->i_mode)) { + up(&inode->i_sem); + } + } + out_status: err = written ? written : status; out: @@ -2020,9 +2046,16 @@ ssize_t generic_file_aio_write(struct ki } down(&inode->i_sem); + if (file->f_flags & O_DIRECT) + down_read(&inode->i_alloc_sem); ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + /* + * For O_DIRECT, i_sem would have already been released and + * i_alloc_sem is released on i/o completion. + */ + if (!(file->f_flags & O_DIRECT)) + up(&inode->i_sem); /* * Avoid doing a sync in parts for aio - its more efficient to @@ -2051,9 +2084,16 @@ ssize_t generic_file_write(struct file * struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + if (file->f_flags & O_DIRECT) + down_read(&inode->i_alloc_sem); down(&inode->i_sem); ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - up(&inode->i_sem); + /* + * For O_DIRECT, i_sem would have already been released and + * i_alloc_sem is released on i/o completion + */ + if (!(file->f_flags & O_DIRECT)) + up(&inode->i_sem); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2084,9 +2124,16 @@ ssize_t generic_file_writev(struct file struct inode *inode = file->f_dentry->d_inode; ssize_t ret; + if (file->f_flags & O_DIRECT) + down_read(&inode->i_alloc_sem); down(&inode->i_sem); ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - up(&inode->i_sem); + /* + * For O_DIRECT, i_sem would have already been released and + * i_alloc_sem is released on i/o completion. + */ + if (!(file->f_flags & O_DIRECT)) + up(&inode->i_sem); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2099,6 +2146,11 @@ ssize_t generic_file_writev(struct file return ret; } +/* + * For regular files, i_sem and i_alloc_sem should be held already. + * i_sem may be dropped later once we've mapped the new IO. + * i_alloc_sem is kept until the IO completes. + */ ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) _