From: Suparna Bhattacharya I picked up Badari's forward port of Stephen Tweedie's DIO fixes from 2.4, and have updated it with the following: - Drop i_sem after i/o submission for DIO reads/writes. - Release i_alloc_sem on DIO completion. - Modify other write paths i.e. generic_file_aio_write, generic_file_writev and xfs write to handle acquistion of i_alloc_sem and align with i_sem and i_alloc_sem being released directly by the DIO code, rather than by the caller. I had to debate a bit about how the _write_nolock() cases should be handled. The assumption I made is that for a regular file, the caller would have already acquired i_alloc_sem and i_sem. When its not a regular file the DIO code doesn't touch i_sem/i_alloc_sem anyway. Let me know how this looks to you. I'm not quite sure how to test these changes to be sure it all works. I have run aio-stress for O_DIRECT reads and writes. I still need to play with running fsx (there seem to be some problems there as I'm seeing short writes - need to investigate whether the problem exists with mm5 as such or if its due to this patch). DESC O_DIRECT race fixes comments EDESC From: Suparna Bhattacharya DESC O_DRIECT race fixes fix fix fix EDESC Fix various deadlocks due to not releasing various locks on various error paths. NFS direct-IO is totally broken. DESC DIO locking rework EDESC From: Badari Pulavarty 1) I noticed that i_alloc_sem and i_sem locking ordering is not=20 consistent. So, I changed the locking order to i_sem (first) and i_alloc_sem (next). I hope this is okay. This allowed me to push the locking down to blockdev_direct_IO. 2) Pushed locking down to blockdev_direct_IO() to allow NFS direct-io. Here is the locking: (i) generic_file_*_write() holds i_sem (as before) and calls blockdev_direct_IO(). We get i_alloc_sem and call direct_io_worker(). (ii) generic_file_*_read() does not hold any locks. blockdev_direct_IO() gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the work (iii) direct_io_worker() does the work and drops i_sem after submitting IOs and drops i_alloc_sem after completing IOs. fs/direct-io.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/inode.c | 1 fs/open.c | 2 + include/linux/fs.h | 2 + mm/filemap.c | 53 +++++++++++++++++++++++++++--------- 5 files changed, 115 insertions(+), 21 deletions(-) diff -puN fs/direct-io.c~O_DIRECT-race-fixes-rollup fs/direct-io.c --- 25/fs/direct-io.c~O_DIRECT-race-fixes-rollup 2003-10-03 22:17:17.000000000 -0700 +++ 25-akpm/fs/direct-io.c 2003-10-03 22:17:17.000000000 -0700 @@ -121,6 +121,11 @@ struct dio { int result; /* IO result */ }; +static inline int dio_is_reg(struct dio *dio) +{ + return S_ISREG(dio->inode->i_mode); +} + /* * How many pages are in the queue? */ @@ -205,6 +210,8 @@ static void dio_complete(struct dio *dio { if (dio->end_io) dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); + if (dio_is_reg(dio)) + up_read(&dio->inode->i_alloc_sem); } /* @@ -448,6 +455,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + int beyond_eof = 0; /* * If there was a memory error and we've overwritten all the @@ -465,8 +473,19 @@ static int get_more_blocks(struct dio *d if (dio_count & blkmask) fs_count++; + if (dio_is_reg(dio)) { + if (dio->block_in_file >= (i_size_read(dio->inode) >> + dio->blkbits)) + beyond_eof = 1; + } + /* + * For writes inside i_size we forbid block creations: only + * overwrites are permitted. We fall back to buffered writes + * at a higher level for inside-i_size block-instantiating + * writes. + */ ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, - map_bh, dio->rw == WRITE); + map_bh, (dio->rw == WRITE) && beyond_eof); } return ret; } @@ -773,6 +792,10 @@ do_holes: if (!buffer_mapped(map_bh)) { char *kaddr; + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw == WRITE) + return -ENOTBLK; + if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ @@ -838,21 +861,21 @@ out: return ret; } +/* + * Releases both i_sem and i_alloc_sem + */ static int direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io, + struct dio *dio) { unsigned long user_addr; int seg; int ret = 0; int ret2; - struct dio *dio; size_t bytes; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - if (!dio) - return -ENOMEM; dio->is_async = !is_sync_kiocb(iocb); dio->bio = NULL; @@ -863,7 +886,6 @@ direct_io_worker(int rw, struct kiocb *i dio->start_zero_done = 0; dio->block_in_file = offset >> blkbits; dio->blocks_available = 0; - dio->cur_page = NULL; dio->boundary = 0; @@ -946,6 +968,13 @@ direct_io_worker(int rw, struct kiocb *i dio_bio_submit(dio); /* + * All new block allocations have been performed. We can let i_sem + * go now. + */ + if (dio_is_reg(dio)) + up(&dio->inode->i_sem); + + /* * OK, all BIOs are submitted, so we can decrement bio_count to truly * reflect the number of to-be-processed BIOs. */ @@ -980,6 +1009,11 @@ direct_io_worker(int rw, struct kiocb *i /* * This is a library function for use by filesystem drivers. + * + * For writes to S_ISREG files, we are called under i_sem and return with i_sem + * held, even though it is internally dropped. + * + * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken. */ int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -993,6 +1027,7 @@ blockdev_direct_IO(int rw, struct kiocb unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; + struct dio *dio; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1018,8 +1053,33 @@ blockdev_direct_IO(int rw, struct kiocb } } - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_blocks, end_io); + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + + /* + * For regular files, + * readers need to grab i_sem and i_alloc_sem + * writers need to grab i_alloc_sem only (i_sem is already held) + */ + if (S_ISREG(inode->i_mode)) { + if (rw == READ) { + down(&inode->i_sem); + retval = filemap_write_and_wait(inode->i_mapping); + if (retval) { + up(&inode->i_sem); + kfree(dio); + goto out; + } + } + down_read(&inode->i_alloc_sem); + } + + retval = direct_io_worker(rw, iocb, inode, iov, offset, + nr_segs, blkbits, get_blocks, end_io, dio); + if (S_ISREG(inode->i_mode) && rw == WRITE) + down(&inode->i_sem); out: return retval; } diff -puN fs/inode.c~O_DIRECT-race-fixes-rollup fs/inode.c --- 25/fs/inode.c~O_DIRECT-race-fixes-rollup 2003-10-03 22:17:17.000000000 -0700 +++ 25-akpm/fs/inode.c 2003-10-03 22:17:17.000000000 -0700 @@ -183,6 +183,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); + init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); diff -puN fs/open.c~O_DIRECT-race-fixes-rollup fs/open.c --- 25/fs/open.c~O_DIRECT-race-fixes-rollup 2003-10-03 22:17:17.000000000 -0700 +++ 25-akpm/fs/open.c 2003-10-03 22:17:17.000000000 -0700 @@ -190,7 +190,9 @@ int do_truncate(struct dentry *dentry, l newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); + down_write(&dentry->d_inode->i_alloc_sem); err = notify_change(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); up(&dentry->d_inode->i_sem); return err; } diff -puN include/linux/fs.h~O_DIRECT-race-fixes-rollup include/linux/fs.h --- 25/include/linux/fs.h~O_DIRECT-race-fixes-rollup 2003-10-03 22:17:17.000000000 -0700 +++ 25-akpm/include/linux/fs.h 2003-10-03 22:17:17.000000000 -0700 @@ -388,6 +388,7 @@ struct inode { unsigned short i_bytes; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct semaphore i_sem; + struct rw_semaphore i_alloc_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; @@ -1203,6 +1204,7 @@ extern void write_inode_now(struct inode extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern int filemap_write_and_wait(struct address_space *mapping); extern void sync_supers(void); extern void sync_filesystems(int wait); extern void emergency_sync(void); diff -puN mm/filemap.c~O_DIRECT-race-fixes-rollup mm/filemap.c --- 25/mm/filemap.c~O_DIRECT-race-fixes-rollup 2003-10-03 22:17:17.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-10-03 22:17:17.000000000 -0700 @@ -73,6 +73,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->i_sem + * ->i_alloc_sem (various) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -223,6 +226,18 @@ restart: EXPORT_SYMBOL(filemap_fdatawait); +int filemap_write_and_wait(struct address_space *mapping) +{ + int retval = 0; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + } + return retval; +} + /* * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. @@ -1683,6 +1698,7 @@ EXPORT_SYMBOL(generic_write_checks); /* * Write to a file through the page cache. + * Called under i_sem for S_ISREG files. * * We put everything into the page cache prior to writing it. This is not a * problem when writing full pages. With partial pages, however, we first have @@ -1771,12 +1787,19 @@ generic_file_aio_write_nolock(struct kio /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. + * i_sem is held, which protects generic_osync_inode() from + * livelocking. */ if (written >= 0 && file->f_flags & O_SYNC) status = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (written >= 0 && !is_sync_kiocb(iocb)) written = -EIOCBQUEUED; - goto out_status; + if (written != -ENOTBLK) + goto out_status; + /* + * direct-io write to a hole: fall through to buffered I/O + */ + written = 0; } buf = iov->iov_base; @@ -1865,6 +1888,14 @@ generic_file_aio_write_nolock(struct kio OSYNC_METADATA|OSYNC_DATA); } + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + out_status: err = written ? written : status; out: @@ -1956,6 +1987,9 @@ ssize_t generic_file_writev(struct file EXPORT_SYMBOL(generic_file_writev); +/* + * Called under i_sem for writes to S_ISREG files + */ ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -1964,18 +1998,13 @@ generic_file_direct_IO(int rw, struct ki struct address_space *mapping = file->f_mapping; ssize_t retval; - if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); - if (retval) - goto out; + retval = filemap_write_and_wait(mapping); + if (retval == 0) { + retval = mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); + if (rw == WRITE && mapping->nrpages) + invalidate_inode_pages2(mapping); } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (rw == WRITE && mapping->nrpages) - invalidate_inode_pages2(mapping); -out: return retval; } _