From: Suparna Bhattacharya I picked up Badari's forward port of Stephen Tweedie's DIO fixes from 2.4, and have updated it with the following: - Drop i_sem after i/o submission for DIO reads/writes. - Release i_alloc_sem on DIO completion. - Modify other write paths i.e. generic_file_aio_write, generic_file_writev and xfs write to handle acquistion of i_alloc_sem and align with i_sem and i_alloc_sem being released directly by the DIO code, rather than by the caller. I had to debate a bit about how the _write_nolock() cases should be handled. The assumption I made is that for a regular file, the caller would have already acquired i_alloc_sem and i_sem. When its not a regular file the DIO code doesn't touch i_sem/i_alloc_sem anyway. Let me know how this looks to you. I'm not quite sure how to test these changes to be sure it all works. I have run aio-stress for O_DIRECT reads and writes. I still need to play with running fsx (there seem to be some problems there as I'm seeing short writes - need to investigate whether the problem exists with mm5 as such or if its due to this patch). DESC O_DIRECT race fixes comments EDESC From: Suparna Bhattacharya DESC O_DRIECT race fixes fix fix fix EDESC Fix various deadlocks due to not releasing various locks on various error paths. NFS direct-IO is totally broken. DESC DIO locking rework EDESC From: Badari Pulavarty 1) I noticed that i_alloc_sem and i_sem locking ordering is not=20 consistent. So, I changed the locking order to i_sem (first) and i_alloc_sem (next). I hope this is okay. This allowed me to push the locking down to blockdev_direct_IO. 2) Pushed locking down to blockdev_direct_IO() to allow NFS direct-io. Here is the locking: (i) generic_file_*_write() holds i_sem (as before) and calls blockdev_direct_IO(). We get i_alloc_sem and call direct_io_worker(). (ii) generic_file_*_read() does not hold any locks. blockdev_direct_IO() gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the work (iii) direct_io_worker() does the work and drops i_sem after submitting IOs and drops i_alloc_sem after completing IOs. DESC O_DIRECT XFS fix EDESC XFS deadlocks, and doesn't need the fancy locking anyway. So provide a separate direct-IO path for XFS, which avoids all the extra locking. fs/direct-io.c | 90 +++++++++++++++++++++++++++++++++++++++++------- fs/inode.c | 1 fs/open.c | 2 + fs/xfs/linux/xfs_aops.c | 3 + include/linux/fs.h | 31 ++++++++++++++-- mm/filemap.c | 53 +++++++++++++++++++++------- 6 files changed, 151 insertions(+), 29 deletions(-) diff -puN fs/direct-io.c~O_DIRECT-race-fixes-rollup fs/direct-io.c --- 25/fs/direct-io.c~O_DIRECT-race-fixes-rollup 2004-01-05 21:59:10.000000000 -0800 +++ 25-akpm/fs/direct-io.c 2004-01-05 21:59:12.000000000 -0800 @@ -52,6 +52,10 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. + * + * needs_locking is set for regular files on direct-IO-naive filesystems. It + * determines whether we need to do the fancy locking which prevents direct-IO + * from being able to read uninitialised disk blocks. */ struct dio { @@ -59,6 +63,7 @@ struct dio { struct bio *bio; /* bio under assembly */ struct inode *inode; int rw; + int needs_locking; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio { if (dio->end_io) dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); + if (dio->needs_locking) + up_read(&dio->inode->i_alloc_sem); } /* @@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + int beyond_eof = 0; /* * If there was a memory error and we've overwritten all the @@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *d if (dio_count & blkmask) fs_count++; + if (dio->needs_locking) { + if (dio->block_in_file >= (i_size_read(dio->inode) >> + dio->blkbits)) + beyond_eof = 1; + } + /* + * For writes inside i_size we forbid block creations: only + * overwrites are permitted. We fall back to buffered writes + * at a higher level for inside-i_size block-instantiating + * writes. + */ ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, - map_bh, dio->rw == WRITE); + map_bh, (dio->rw == WRITE) && beyond_eof); } return ret; } @@ -774,6 +793,10 @@ do_holes: if (!buffer_mapped(map_bh)) { char *kaddr; + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw == WRITE) + return -ENOTBLK; + if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ @@ -839,21 +862,21 @@ out: return ret; } +/* + * Releases both i_sem and i_alloc_sem + */ static int direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io, + struct dio *dio) { unsigned long user_addr; int seg; int ret = 0; int ret2; - struct dio *dio; size_t bytes; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - if (!dio) - return -ENOMEM; dio->is_async = !is_sync_kiocb(iocb); dio->bio = NULL; @@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *i dio->start_zero_done = 0; dio->block_in_file = offset >> blkbits; dio->blocks_available = 0; - dio->cur_page = NULL; dio->boundary = 0; @@ -953,6 +975,13 @@ direct_io_worker(int rw, struct kiocb *i dio_cleanup(dio); /* + * All new block allocations have been performed. We can let i_sem + * go now. + */ + if (dio->needs_locking) + up(&dio->inode->i_sem); + + /* * OK, all BIOs are submitted, so we can decrement bio_count to truly * reflect the number of to-be-processed BIOs. */ @@ -987,11 +1016,17 @@ direct_io_worker(int rw, struct kiocb *i /* * This is a library function for use by filesystem drivers. + * + * For writes to S_ISREG files, we are called under i_sem and return with i_sem + * held, even though it is internally dropped. + * + * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken. */ int -blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + int needs_special_locking) { int seg; size_t size; @@ -1000,6 +1035,8 @@ blockdev_direct_IO(int rw, struct kiocb unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; + struct dio *dio; + int needs_locking; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1025,10 +1062,37 @@ blockdev_direct_IO(int rw, struct kiocb } } - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_blocks, end_io); + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + + /* + * For regular files, + * readers need to grab i_sem and i_alloc_sem + * writers need to grab i_alloc_sem only (i_sem is already held) + */ + needs_locking = 0; + if (S_ISREG(inode->i_mode) && needs_special_locking) { + needs_locking = 1; + if (rw == READ) { + down(&inode->i_sem); + retval = filemap_write_and_wait(inode->i_mapping); + if (retval) { + up(&inode->i_sem); + kfree(dio); + goto out; + } + } + down_read(&inode->i_alloc_sem); + } + dio->needs_locking = needs_locking; + + retval = direct_io_worker(rw, iocb, inode, iov, offset, + nr_segs, blkbits, get_blocks, end_io, dio); + if (needs_locking && rw == WRITE) + down(&inode->i_sem); out: return retval; } - -EXPORT_SYMBOL(blockdev_direct_IO); +EXPORT_SYMBOL(__blockdev_direct_IO); diff -puN fs/inode.c~O_DIRECT-race-fixes-rollup fs/inode.c --- 25/fs/inode.c~O_DIRECT-race-fixes-rollup 2004-01-05 21:59:10.000000000 -0800 +++ 25-akpm/fs/inode.c 2004-01-05 21:59:10.000000000 -0800 @@ -183,6 +183,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); + init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); diff -puN fs/open.c~O_DIRECT-race-fixes-rollup fs/open.c --- 25/fs/open.c~O_DIRECT-race-fixes-rollup 2004-01-05 21:59:10.000000000 -0800 +++ 25-akpm/fs/open.c 2004-01-05 21:59:10.000000000 -0800 @@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, l newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); + down_write(&dentry->d_inode->i_alloc_sem); err = notify_change(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); up(&dentry->d_inode->i_sem); return err; } diff -puN include/linux/fs.h~O_DIRECT-race-fixes-rollup include/linux/fs.h --- 25/include/linux/fs.h~O_DIRECT-race-fixes-rollup 2004-01-05 21:59:10.000000000 -0800 +++ 25-akpm/include/linux/fs.h 2004-01-05 21:59:10.000000000 -0800 @@ -390,6 +390,7 @@ struct inode { unsigned short i_bytes; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct semaphore i_sem; + struct rw_semaphore i_alloc_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; @@ -1209,6 +1210,7 @@ extern void write_inode_now(struct inode extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern int filemap_write_and_wait(struct address_space *mapping); extern void sync_supers(void); extern void sync_filesystems(int wait); extern void emergency_sync(void); @@ -1320,9 +1322,6 @@ extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); -extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos); ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, @@ -1344,6 +1343,32 @@ static inline void do_generic_file_read( actor); } +int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + int needs_special_locking); + +/* + * For filesystems which need locking between buffered and direct access + */ +static inline int blockdev_direct_IO(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, 1); +} + +static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, 0); +} + extern struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) diff -puN mm/filemap.c~O_DIRECT-race-fixes-rollup mm/filemap.c --- 25/mm/filemap.c~O_DIRECT-race-fixes-rollup 2004-01-05 21:59:10.000000000 -0800 +++ 25-akpm/mm/filemap.c 2004-01-05 21:59:10.000000000 -0800 @@ -74,6 +74,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->i_sem + * ->i_alloc_sem (various) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -227,6 +230,18 @@ restart: EXPORT_SYMBOL(filemap_fdatawait); +int filemap_write_and_wait(struct address_space *mapping) +{ + int retval = 0; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + } + return retval; +} + /* * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. @@ -1813,6 +1828,7 @@ EXPORT_SYMBOL(generic_write_checks); /* * Write to a file through the page cache. + * Called under i_sem for S_ISREG files. * * We put everything into the page cache prior to writing it. This is not a * problem when writing full pages. With partial pages, however, we first have @@ -1901,12 +1917,19 @@ generic_file_aio_write_nolock(struct kio /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. + * i_sem is held, which protects generic_osync_inode() from + * livelocking. */ if (written >= 0 && file->f_flags & O_SYNC) status = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (written >= 0 && !is_sync_kiocb(iocb)) written = -EIOCBQUEUED; - goto out_status; + if (written != -ENOTBLK) + goto out_status; + /* + * direct-io write to a hole: fall through to buffered I/O + */ + written = 0; } buf = iov->iov_base; @@ -1995,6 +2018,14 @@ generic_file_aio_write_nolock(struct kio OSYNC_METADATA|OSYNC_DATA); } + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + out_status: err = written ? written : status; out: @@ -2086,6 +2117,9 @@ ssize_t generic_file_writev(struct file EXPORT_SYMBOL(generic_file_writev); +/* + * Called under i_sem for writes to S_ISREG files + */ ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2094,18 +2128,13 @@ generic_file_direct_IO(int rw, struct ki struct address_space *mapping = file->f_mapping; ssize_t retval; - if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); - if (retval) - goto out; + retval = filemap_write_and_wait(mapping); + if (retval == 0) { + retval = mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); + if (rw == WRITE && mapping->nrpages) + invalidate_inode_pages2(mapping); } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (rw == WRITE && mapping->nrpages) - invalidate_inode_pages2(mapping); -out: return retval; } diff -puN fs/xfs/linux/xfs_aops.c~O_DIRECT-race-fixes-rollup fs/xfs/linux/xfs_aops.c --- 25/fs/xfs/linux/xfs_aops.c~O_DIRECT-race-fixes-rollup 2004-01-05 21:59:10.000000000 -0800 +++ 25-akpm/fs/xfs/linux/xfs_aops.c 2004-01-05 21:59:10.000000000 -0800 @@ -984,7 +984,8 @@ linvfs_direct_IO( if (error) return -error; - return blockdev_direct_IO(rw, iocb, inode, pbmap.pbm_target->pbr_bdev, + return blockdev_direct_IO_no_locking(rw, iocb, inode, + pbmap.pbm_target->pbr_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, linvfs_unwritten_convert_direct); _