From: Nathan Scott This patch adds a mechanism by which a filesystem can register an interest in the completion of direct I/O. The completion routine will be given the inode, an offset and a length, and an optional filesystem-private field. We have extended the use of the buffer_head-based interface (i.e. get_blocks_t) for direct I/O such that the b_private field is now utilised. It is defined to be NULL at the start of I/O, and will be passed into the filesystem unmodified by the VFS with each map request while the direct I/O is being set up. Once I/O has completed, the final value of this pointer will be passed into the filesystem's I/O completion handler. This mechanism can be used to keep track of all of the mapping requests which make up an individual direct I/O request. This has been implemented specifically for XFS, but is designed to be as generic as possible. XFS uses this mechanism to provide support for unwritten extents - these are file extents which have been pre-allocated on-disk, but not yet written to (once written, these become regular file extents, but only once I/O is complete).
25-akpm/fs/block_dev.c | 2 - 25-akpm/fs/direct-io.c | 28 +++++++++++++++++++++++---- 25-akpm/fs/ext2/inode.c | 2 - 25-akpm/fs/ext3/inode.c | 3 +- 25-akpm/fs/jfs/inode.c | 2 - 25-akpm/fs/xfs/linux/xfs_aops.c | 41 +++++++++++++++++++++++++++++++++------- 25-akpm/include/linux/fs.h | 4 ++- 7 files changed, 66 insertions(+), 16 deletions(-) diff -puN fs/block_dev.c~xfs-dio-unwritten-extents fs/block_dev.c --- 25/fs/block_dev.c~xfs-dio-unwritten-extents Wed Jul 30 14:16:24 2003 +++ 25-akpm/fs/block_dev.c Wed Jul 30 14:16:24 2003 @@ -125,7 +125,7 @@ blkdev_direct_IO(int rw, struct kiocb *i struct inode *inode = file->f_dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_bdev, iov, offset, - nr_segs, blkdev_get_blocks); + nr_segs, blkdev_get_blocks, NULL); } static int blkdev_writepage(struct page *page, struct writeback_control *wbc) diff -puN fs/direct-io.c~xfs-dio-unwritten-extents fs/direct-io.c --- 25/fs/direct-io.c~xfs-dio-unwritten-extents Wed Jul 30 14:16:24 2003 +++ 25-akpm/fs/direct-io.c Wed Jul 30 14:16:24 2003 @@ -15,6 +15,8 @@ * added support for non-aligned IO. * 06Nov2002 pbadari@us.ibm.com * added asynchronous IO support. + * 21Jul2003 nathans@sgi.com + * added IO completion notifier. */ #include @@ -74,6 +76,7 @@ struct dio { int boundary; /* prev block is at a boundary */ int reap_counter; /* rate limit reaping */ get_blocks_t *get_blocks; /* block mapping function */ + dio_iodone_t *end_io; /* IO completion function */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ @@ -193,13 +196,27 @@ static struct page *dio_get_page(struct } /* + * Called when all DIO BIO I/O has been completed - let the filesystem + * know, if it registered an interest earlier via get_blocks. Pass the + * private field of the map buffer_head so that filesystems can use it + * to hold additional state between get_blocks calls and dio_complete. 
+ */ +static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes) +{ + if (dio->end_io) + dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); +} + +/* * Called when a BIO has been processed. If the count goes to zero then IO is * complete and we can signal this to the AIO layer. */ static void finished_one_bio(struct dio *dio) { if (atomic_dec_and_test(&dio->bio_count)) { - if(dio->is_async) { + if (dio->is_async) { + dio_complete(dio, dio->block_in_file << dio->blkbits, + dio->result); aio_complete(dio->iocb, dio->result, 0); kfree(dio); } @@ -824,7 +841,7 @@ out: static int direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, - unsigned blkbits, get_blocks_t get_blocks) + unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io) { unsigned long user_addr; int seg; @@ -852,6 +869,8 @@ direct_io_worker(int rw, struct kiocb *i dio->boundary = 0; dio->reap_counter = 0; dio->get_blocks = get_blocks; + dio->end_io = end_io; + dio->map_bh.b_private = NULL; dio->final_block_in_bio = -1; dio->next_block_for_io = -1; @@ -953,6 +972,7 @@ direct_io_worker(int rw, struct kiocb *i if (rw == READ && (offset + ret > i_size)) ret = i_size - offset; } + dio_complete(dio, offset, ret); kfree(dio); } return ret; @@ -964,7 +984,7 @@ direct_io_worker(int rw, struct kiocb *i int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t get_blocks) + unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io) { int seg; size_t size; @@ -999,7 +1019,7 @@ blockdev_direct_IO(int rw, struct kiocb } retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_blocks); + nr_segs, blkbits, get_blocks, end_io); out: return retval; } diff -puN fs/ext2/inode.c~xfs-dio-unwritten-extents fs/ext2/inode.c --- 25/fs/ext2/inode.c~xfs-dio-unwritten-extents Wed 
Jul 30 14:16:24 2003 +++ 25-akpm/fs/ext2/inode.c Wed Jul 30 14:16:24 2003 @@ -662,7 +662,7 @@ ext2_direct_IO(int rw, struct kiocb *ioc struct inode *inode = file->f_dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, ext2_get_blocks); + offset, nr_segs, ext2_get_blocks, NULL); } static int diff -puN fs/ext3/inode.c~xfs-dio-unwritten-extents fs/ext3/inode.c --- 25/fs/ext3/inode.c~xfs-dio-unwritten-extents Wed Jul 30 14:16:24 2003 +++ 25-akpm/fs/ext3/inode.c Wed Jul 30 14:16:24 2003 @@ -1562,7 +1562,8 @@ static int ext3_direct_IO(int rw, struct } ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, ext3_direct_io_get_blocks); + offset, nr_segs, + ext3_direct_io_get_blocks, NULL); out_stop: if (handle) { diff -puN fs/jfs/inode.c~xfs-dio-unwritten-extents fs/jfs/inode.c --- 25/fs/jfs/inode.c~xfs-dio-unwritten-extents Wed Jul 30 14:16:24 2003 +++ 25-akpm/fs/jfs/inode.c Wed Jul 30 14:16:24 2003 @@ -308,7 +308,7 @@ static int jfs_direct_IO(int rw, struct struct inode *inode = file->f_dentry->d_inode->i_mapping->host; return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, jfs_get_blocks); + offset, nr_segs, jfs_get_blocks, NULL); } struct address_space_operations jfs_aops = { diff -puN fs/xfs/linux/xfs_aops.c~xfs-dio-unwritten-extents fs/xfs/linux/xfs_aops.c --- 25/fs/xfs/linux/xfs_aops.c~xfs-dio-unwritten-extents Wed Jul 30 14:16:24 2003 +++ 25-akpm/fs/xfs/linux/xfs_aops.c Wed Jul 30 14:16:24 2003 @@ -76,10 +76,10 @@ linvfs_unwritten_done( /* * Issue transactions to convert a buffer range from unwritten - * to written extents. + * to written extents (buffered IO). 
*/ STATIC void -linvfs_unwritten_conv( +linvfs_unwritten_convert( xfs_buf_t *bp) { vnode_t *vp = XFS_BUF_FSPRIVATE(bp, vnode_t *); @@ -96,6 +96,30 @@ linvfs_unwritten_conv( pagebuf_iodone(bp, 0, 0); } +/* + * Issue transactions to convert a buffer range from unwritten + * to written extents (direct IO). + */ +STATIC void +linvfs_unwritten_convert_direct( + struct inode *inode, + loff_t offset, + ssize_t size, + void *private) +{ + ASSERT(!private || inode == (struct inode *)private); + + /* private indicates an unwritten extent lay beneath this IO, + * see linvfs_get_block_core. + */ + if (private && size > 0) { + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + VOP_BMAP(vp, offset, size, BMAP_UNWRITTEN, NULL, NULL, error); + } +} + STATIC int map_blocks( struct inode *inode, @@ -456,7 +480,7 @@ map_unwritten( XFS_BUF_SET_SIZE(pb, size); XFS_BUF_SET_OFFSET(pb, offset); XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode)); - XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_conv); + XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert); if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { pagebuf_iodone(pb, 1, 1); @@ -804,7 +828,7 @@ STATIC int linvfs_get_block_core( struct inode *inode, sector_t iblock, - int blocks, + unsigned long blocks, struct buffer_head *bh_result, int create, int direct, @@ -854,8 +878,11 @@ linvfs_get_block_core( set_buffer_mapped(bh_result); } if (pbmap.pbm_flags & PBMF_UNWRITTEN) { - if (create) + if (create) { + if (direct) + bh_result->b_private = inode; set_buffer_mapped(bh_result); + } set_buffer_unwritten(bh_result); set_buffer_delay(bh_result); } @@ -935,8 +962,8 @@ linvfs_direct_IO( struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode->i_mapping->host; - return blockdev_direct_IO(rw, iocb, inode, NULL, - iov, offset, nr_segs, linvfs_get_blocks_direct); + return blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs, + linvfs_get_blocks_direct, linvfs_unwritten_convert_direct); } diff -puN 
include/linux/fs.h~xfs-dio-unwritten-extents include/linux/fs.h --- 25/include/linux/fs.h~xfs-dio-unwritten-extents Wed Jul 30 14:16:24 2003 +++ 25-akpm/include/linux/fs.h Wed Jul 30 14:16:24 2003 @@ -219,6 +219,8 @@ typedef int (get_block_t)(struct inode * typedef int (get_blocks_t)(struct inode *inode, sector_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, int create); +typedef void (dio_iodone_t)(struct inode *inode, loff_t offset, + ssize_t bytes, void *private); /* * Attribute flags. These should be or-ed together to figure out what @@ -1291,7 +1293,7 @@ extern ssize_t generic_file_direct_IO(in const struct iovec *iov, loff_t offset, unsigned long nr_segs); extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, - unsigned long nr_segs, get_blocks_t *get_blocks); + unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos); ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, _