From: Nathan Scott Here's a patch which fixes the deadlock Brad Fitzpatrick reported here: http://lkml.org/lkml/2004/11/14/98 The meat of the problem is a locking order reversal between the XFS I/O lock and the inode semaphore (i_sem). When we mix a number of threads doing both direct reads and writes, we hit an ABBA deadlock after a while because direct-io.c is taking and dropping i_sem after the XFS read path has already taken its I/O lock. This is the wrong way around from XFS's point of view, in particular its the opposite order to the XFS write path. So this patch changes the logic for direct reads in the DIO_OWN_LOCKING case (i.e. XFS-only case), but leaves things as is for the other two types of locking. Not real pretty, but fixes up the lock ordering and deadlock. Oh, I have tested direct reads and writes on ext3 and block devices as well, to ensure they still function correctly with the change (i.e. they don't regress, should be a no-op there) which covers the other locking cases in __blockdev_direct_IO. Signed-off-by: Andrew Morton --- 25-akpm/fs/direct-io.c | 42 +++++++++++++++++++++++++++++-------- 25-akpm/fs/xfs/linux-2.6/xfs_lrw.c | 17 +++++++++----- 2 files changed, 45 insertions(+), 14 deletions(-) diff -puN fs/direct-io.c~fix-an-xfs-direct-i-o-deadlock fs/direct-io.c --- 25/fs/direct-io.c~fix-an-xfs-direct-i-o-deadlock 2004-11-24 18:47:17.870101920 -0800 +++ 25-akpm/fs/direct-io.c 2004-11-24 18:47:17.881100248 -0800 @@ -1147,11 +1147,23 @@ direct_io_worker(int rw, struct kiocb *i /* * This is a library function for use by filesystem drivers. + * The locking rules are governed by the dio_lock_type parameter. * - * For writes to S_ISREG files, we are called under i_sem and return with i_sem - * held, even though it is internally dropped. + * DIO_NO_LOCKING (no locking, for raw block device access) + * For writes, i_sem is not held on entry; it is never taken. * - * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken. + * DIO_LOCKING (simple locking for regular files) + * For writes we are called under i_sem and return with i_sem held, even though + * it is internally dropped. + * For reads, i_sem is not held on entry, but it is taken and dropped before + * returning. + * + * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of + * uninitialised data, allowing parallel direct readers and writers) + * For writes we are called without i_sem, return without it, never touch it. + * For reads, i_sem is held on entry and will be released before returning. + * + * Additional i_alloc_sem locking requirements described inline below. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1168,6 +1180,7 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; + int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING); if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1200,12 +1213,14 @@ __blockdev_direct_IO(int rw, struct kioc goto out; /* + * For block device access DIO_NO_LOCKING is used, + * neither readers nor writers do any locking at all * For regular files using DIO_LOCKING, * readers need to grab i_sem and i_alloc_sem * writers need to grab i_alloc_sem only (i_sem is already held) * For regular files using DIO_OWN_LOCKING, - * both readers and writers need to grab i_alloc_sem - * neither readers nor writers hold i_sem on entry (nor exit) + * readers need to grab i_alloc_sem only (i_sem is already held) + * writers need to grab i_alloc_sem only */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1213,20 +1228,25 @@ __blockdev_direct_IO(int rw, struct kioc struct address_space *mapping; mapping = iocb->ki_filp->f_mapping; - down(&inode->i_sem); + if (dio_lock_type != DIO_OWN_LOCKING) { + down(&inode->i_sem); + reader_with_isem = 1; + } retval = filemap_write_and_wait(mapping); if (retval) { - up(&inode->i_sem); kfree(dio); goto out; } down_read(&inode->i_alloc_sem); - if (dio_lock_type == DIO_OWN_LOCKING) + if (dio_lock_type == DIO_OWN_LOCKING) { up(&inode->i_sem); + reader_with_isem = 0; + } } else { down_read(&inode->i_alloc_sem); } } + /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. So @@ -1238,7 +1258,13 @@ __blockdev_direct_IO(int rw, struct kioc retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_blocks, end_io, dio); + + if (rw == READ && dio_lock_type == DIO_LOCKING) + reader_with_isem = 0; + out: + if (reader_with_isem) + up(&inode->i_sem); return retval; } EXPORT_SYMBOL(__blockdev_direct_IO); diff -puN fs/xfs/linux-2.6/xfs_lrw.c~fix-an-xfs-direct-i-o-deadlock fs/xfs/linux-2.6/xfs_lrw.c --- 25/fs/xfs/linux-2.6/xfs_lrw.c~fix-an-xfs-direct-i-o-deadlock 2004-11-24 18:47:17.876101008 -0800 +++ 25-akpm/fs/xfs/linux-2.6/xfs_lrw.c 2004-11-24 18:47:17.880100400 -0800 @@ -244,6 +244,7 @@ xfs_read( cred_t *credp) { struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; size_t size = 0; ssize_t ret; xfs_fsize_t n; @@ -272,7 +273,7 @@ xfs_read( } /* END copy & waste from filemap.c */ - if (ioflags & IO_ISDIRECT) { + if (unlikely(ioflags & IO_ISDIRECT)) { xfs_buftarg_t *target = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -296,18 +297,20 @@ xfs_read( return -EIO; } + if (unlikely(ioflags & IO_ISDIRECT)) + down(&inode->i_sem); xfs_ilock(ip, XFS_IOLOCK_SHARED); if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { vrwlock_t locktype = VRWLOCK_READ; - ret = XFS_SEND_DATA(mp, DM_EVENT_READ, + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, size, FILP_DELAY_FLAG(file), &locktype); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -ret; + goto unlock_isem; } } @@ -316,15 +319,17 @@ xfs_read( ret = __generic_file_aio_read(iocb, iovp, segs, offset); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(iocb); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (likely(!(ioflags & IO_INVIS))) xfs_ichgtime(ip, XFS_ICHGTIME_ACC); +unlock_isem: + if (unlikely(ioflags & IO_ISDIRECT)) + up(&inode->i_sem); return ret; } _