From: Badari Pulavarty, Suparna Bhattacharya, Andrew Morton Forward port of Stephen Tweedie's DIO fixes from 2.4, to fix various DIO vs buffered IO exposures involving races causing: (a) stale data from uninstantiated blocks to be read, e.g. - O_DIRECT reads against buffered writes to a sparse region - O_DIRECT writes to a sparse region against buffered reads (b) potential data corruption with - O_DIRECT IOs against truncate due to writes to truncated blocks (which may have been reallocated to another file). Summary of fixes: 1) All the changes affect only regular files. RAW/O_DIRECT on block are unaffected. 2) The DIO code will not fill in sparse regions on a write. Instead -ENOTBLK is returned and the generic file write code would fallthrough to buffered IO in this case followed by writing through the pages to disk using filemap_fdatawrite/wait. 3) i_sem is held during both DIO reads and writes. For reads, and writes to already allocated blocks, it is released right after IO is issued, while for writes to newly allocated blocks (e.g file extending writes and hole overwrites) it is held all the way through until IO completes (and data is committed to disk). 4) filemap_fdatawrite/wait are called under i_sem to synchronize buffered pages to disk blocks before issuing DIO. 5) A new rwsem (i_alloc_sem) is held in shared mode all the while a DIO (read or write) is in progress, and in exclusive mode by truncate to guard against deallocation of data blocks during DIO. 6) All this new locking has been pushed down into blockdev_direct_IO to avoid interfering with NFS direct IO. The locks are taken in the order i_sem followed by i_alloc_sem. While i_sem may be released after IO submission in some cases, i_alloc_sem is held through until dio_complete (in the case of AIO-DIO this happens through the IO completion callback). 7) i_sem and i_alloc_sem are not held for the _nolock versions of write routines, as used by blockdev and XFS. Filesystems can specify the needs_special_locking parameter to __blockdev_direct_IO from their direct IO address space op accordingly. Note from Badari: Here is the locking (when needs_special_locking is true): (1) generic_file_*_write() holds i_sem (as before) and calls ->direct_IO(). blockdev_direct_IO gets i_alloc_sem and call direct_io_worker(). (2) generic_file_*_read() does not hold any locks. blockdev_direct_IO() gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the work (3) direct_io_worker() does the work and drops i_sem after submitting IOs if appropriate and drops i_alloc_sem after completing IOs. --- 25-akpm/fs/direct-io.c | 93 ++++++++++++++++++++++++++++++++++------ 25-akpm/fs/inode.c | 1 25-akpm/fs/open.c | 2 25-akpm/fs/xfs/linux/xfs_aops.c | 3 - 25-akpm/include/linux/fs.h | 31 ++++++++++++- 25-akpm/mm/filemap.c | 53 +++++++++++++++++----- 6 files changed, 154 insertions(+), 29 deletions(-) diff -puN fs/direct-io.c~O_DIRECT-race-fixes-rollup fs/direct-io.c --- 25/fs/direct-io.c~O_DIRECT-race-fixes-rollup 2004-04-03 03:00:09.338811840 -0800 +++ 25-akpm/fs/direct-io.c 2004-04-03 03:00:09.350810016 -0800 @@ -52,6 +52,10 @@ * * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. + * + * needs_locking is set for regular files on direct-IO-naive filesystems. It + * determines whether we need to do the fancy locking which prevents direct-IO + * from being able to read uninitialised disk blocks. */ struct dio { @@ -59,6 +63,7 @@ struct dio { struct bio *bio; /* bio under assembly */ struct inode *inode; int rw; + int needs_locking; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio { if (dio->end_io) dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); + if (dio->needs_locking) + up_read(&dio->inode->i_alloc_sem); } /* @@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; + int beyond_eof = 0; /* * If there was a memory error and we've overwritten all the @@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *d if (dio_count & blkmask) fs_count++; + if (dio->needs_locking) { + if (dio->block_in_file >= (i_size_read(dio->inode) >> + dio->blkbits)) + beyond_eof = 1; + } + /* + * For writes inside i_size we forbid block creations: only + * overwrites are permitted. We fall back to buffered writes + * at a higher level for inside-i_size block-instantiating + * writes. + */ ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, - map_bh, dio->rw == WRITE); + map_bh, (dio->rw == WRITE) && beyond_eof); } return ret; } @@ -774,6 +793,10 @@ do_holes: if (!buffer_mapped(map_bh)) { char *kaddr; + /* AKPM: eargh, -ENOTBLK is a hack */ + if (dio->rw == WRITE) + return -ENOTBLK; + if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ @@ -839,21 +862,21 @@ out: return ret; } +/* + * Releases both i_sem and i_alloc_sem + */ static int direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io, + struct dio *dio) { unsigned long user_addr; int seg; int ret = 0; int ret2; - struct dio *dio; size_t bytes; - dio = kmalloc(sizeof(*dio), GFP_KERNEL); - if (!dio) - return -ENOMEM; dio->is_async = !is_sync_kiocb(iocb); dio->bio = NULL; @@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *i dio->start_zero_done = 0; dio->block_in_file = offset >> blkbits; dio->blocks_available = 0; - dio->cur_page = NULL; dio->boundary = 0; @@ -953,6 +975,13 @@ direct_io_worker(int rw, struct kiocb *i dio_cleanup(dio); /* + * All new block allocations have been performed. We can let i_sem + * go now. + */ + if (dio->needs_locking) + up(&dio->inode->i_sem); + + /* * OK, all BIOs are submitted, so we can decrement bio_count to truly * reflect the number of to-be-processed BIOs. */ @@ -987,11 +1016,17 @@ direct_io_worker(int rw, struct kiocb *i /* * This is a library function for use by filesystem drivers. + * + * For writes to S_ISREG files, we are called under i_sem and return with i_sem + * held, even though it is internally dropped. + * + * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken. */ int -blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io) + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + int needs_special_locking) { int seg; size_t size; @@ -1000,6 +1035,8 @@ blockdev_direct_IO(int rw, struct kiocb unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; + struct dio *dio; + int needs_locking; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1025,10 +1062,40 @@ blockdev_direct_IO(int rw, struct kiocb } } - retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_blocks, end_io); + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + retval = -ENOMEM; + if (!dio) + goto out; + + /* + * For regular files, + * readers need to grab i_sem and i_alloc_sem + * writers need to grab i_alloc_sem only (i_sem is already held) + */ + needs_locking = 0; + if (S_ISREG(inode->i_mode) && needs_special_locking) { + needs_locking = 1; + if (rw == READ) { + struct address_space *mapping; + + mapping = iocb->ki_filp->f_mapping; + down(&inode->i_sem); + retval = filemap_write_and_wait(mapping); + if (retval) { + up(&inode->i_sem); + kfree(dio); + goto out; + } + } + down_read(&inode->i_alloc_sem); + } + dio->needs_locking = needs_locking; + + retval = direct_io_worker(rw, iocb, inode, iov, offset, + nr_segs, blkbits, get_blocks, end_io, dio); + if (needs_locking && rw == WRITE) + down(&inode->i_sem); out: return retval; } - -EXPORT_SYMBOL(blockdev_direct_IO); +EXPORT_SYMBOL(__blockdev_direct_IO); diff -puN fs/inode.c~O_DIRECT-race-fixes-rollup fs/inode.c --- 25/fs/inode.c~O_DIRECT-race-fixes-rollup 2004-04-03 03:00:09.340811536 -0800 +++ 25-akpm/fs/inode.c 2004-04-03 03:00:09.351809864 -0800 @@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); + init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.page_lock); init_MUTEX(&inode->i_data.i_shared_sem); diff -puN fs/open.c~O_DIRECT-race-fixes-rollup fs/open.c --- 25/fs/open.c~O_DIRECT-race-fixes-rollup 2004-04-03 03:00:09.341811384 -0800 +++ 25-akpm/fs/open.c 2004-04-03 03:00:09.352809712 -0800 @@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, l newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; down(&dentry->d_inode->i_sem); + down_write(&dentry->d_inode->i_alloc_sem); err = notify_change(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); up(&dentry->d_inode->i_sem); return err; } diff -puN include/linux/fs.h~O_DIRECT-race-fixes-rollup include/linux/fs.h --- 25/include/linux/fs.h~O_DIRECT-race-fixes-rollup 2004-04-03 03:00:09.343811080 -0800 +++ 25-akpm/include/linux/fs.h 2004-04-03 03:00:09.354809408 -0800 @@ -397,6 +397,7 @@ struct inode { unsigned short i_bytes; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ struct semaphore i_sem; + struct rw_semaphore i_alloc_sem; struct inode_operations *i_op; struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct super_block *i_sb; @@ -1235,6 +1236,7 @@ extern void write_inode_now(struct inode extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern int filemap_write_and_wait(struct address_space *mapping); extern void sync_supers(void); extern void sync_filesystems(int wait); extern void emergency_sync(void); @@ -1347,9 +1349,6 @@ extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); -extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos); ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, @@ -1371,6 +1370,32 @@ static inline void do_generic_file_read( actor); } +int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, + int needs_special_locking); + +/* + * For filesystems which need locking between buffered and direct access + */ +static inline int blockdev_direct_IO(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, 1); +} + +static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, 0); +} + extern struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) diff -puN mm/filemap.c~O_DIRECT-race-fixes-rollup mm/filemap.c --- 25/mm/filemap.c~O_DIRECT-race-fixes-rollup 2004-04-03 03:00:09.344810928 -0800 +++ 25-akpm/mm/filemap.c 2004-04-03 03:00:09.356809104 -0800 @@ -73,6 +73,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->i_sem + * ->i_alloc_sem (various) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -228,6 +231,18 @@ restart: EXPORT_SYMBOL(filemap_fdatawait); +int filemap_write_and_wait(struct address_space *mapping) +{ + int retval = 0; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + } + return retval; +} + /* * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. @@ -1716,6 +1731,7 @@ EXPORT_SYMBOL(generic_write_checks); /* * Write to a file through the page cache. + * Called under i_sem for S_ISREG files. * * We put everything into the page cache prior to writing it. This is not a * problem when writing full pages. With partial pages, however, we first have @@ -1806,12 +1822,19 @@ generic_file_aio_write_nolock(struct kio /* * Sync the fs metadata but not the minor inode changes and * of course not the data as we did direct DMA for the IO. + * i_sem is held, which protects generic_osync_inode() from + * livelocking. */ if (written >= 0 && file->f_flags & O_SYNC) status = generic_osync_inode(inode, mapping, OSYNC_METADATA); if (written >= 0 && !is_sync_kiocb(iocb)) written = -EIOCBQUEUED; - goto out_status; + if (written != -ENOTBLK) + goto out_status; + /* + * direct-io write to a hole: fall through to buffered I/O + */ + written = 0; } buf = iov->iov_base; @@ -1900,6 +1923,14 @@ generic_file_aio_write_nolock(struct kio OSYNC_METADATA|OSYNC_DATA); } + /* + * If we get here for O_DIRECT writes then we must have fallen through + * to buffered writes (block instantiation inside i_size). So we sync + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) + status = filemap_write_and_wait(mapping); + out_status: err = written ? written : status; out: @@ -1991,6 +2022,9 @@ ssize_t generic_file_writev(struct file EXPORT_SYMBOL(generic_file_writev); +/* + * Called under i_sem for writes to S_ISREG files + */ ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -1999,18 +2033,13 @@ generic_file_direct_IO(int rw, struct ki struct address_space *mapping = file->f_mapping; ssize_t retval; - if (mapping->nrpages) { - retval = filemap_fdatawrite(mapping); - if (retval == 0) - retval = filemap_fdatawait(mapping); - if (retval) - goto out; + retval = filemap_write_and_wait(mapping); + if (retval == 0) { + retval = mapping->a_ops->direct_IO(rw, iocb, iov, + offset, nr_segs); + if (rw == WRITE && mapping->nrpages) + invalidate_inode_pages2(mapping); } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (rw == WRITE && mapping->nrpages) - invalidate_inode_pages2(mapping); -out: return retval; } diff -puN fs/xfs/linux/xfs_aops.c~O_DIRECT-race-fixes-rollup fs/xfs/linux/xfs_aops.c --- 25/fs/xfs/linux/xfs_aops.c~O_DIRECT-race-fixes-rollup 2004-04-03 03:00:09.345810776 -0800 +++ 25-akpm/fs/xfs/linux/xfs_aops.c 2004-04-03 03:00:09.357808952 -0800 @@ -1032,7 +1032,8 @@ linvfs_direct_IO( if (error) return -error; - return blockdev_direct_IO(rw, iocb, inode, iomap.iomap_target->pbr_bdev, + return blockdev_direct_IO_no_locking(rw, iocb, inode, + iomap.iomap_target->pbr_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, linvfs_unwritten_convert_direct); _