aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ntfs.txt5
-rw-r--r--fs/ntfs/ChangeLog42
-rw-r--r--fs/ntfs/mft.c29
3 files changed, 65 insertions, 11 deletions
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 1415b96ed491fe..eef4aca0c75304 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -451,9 +451,12 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
- Implement extension of resident files using the normal file write
code paths, i.e. most very small files can be extended to be a little
bit bigger but not by much.
+ - Add new mount option "disable_sparse". (See list of mount options
+ above for details.)
- Improve handling of ntfs volumes with errors and strange boot sectors
in particular.
- - Fix various bugs.
+ - Fix various bugs including a nasty deadlock that appeared in recent
+ kernels (around 2.6.11-2.6.12 timeframe).
2.1.22:
- Improve handling of ntfs volumes with errors.
- Fix various bugs and race conditions.
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 3d2cac4061d651..9709fac6531dd7 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -132,6 +132,48 @@ ToDo/Notes:
the already mapped runlist fragment which causes
ntfs_mapping_pairs_decompress() to fail and return error. Update
ntfs_attr_find_vcn_nolock() accordingly.
+ - Fix a nasty deadlock that appeared in recent kernels.
+ The situation: VFS inode X on a mounted ntfs volume is dirty. For
+ same inode X, the ntfs_inode is dirty and thus corresponding on-disk
+ inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
+ to the table of inodes, i.e. $MFT, inode 0.
+ What happens:
+ Process 1: sys_sync()/umount()/whatever... calls
+ __sync_single_inode() for $MFT -> do_writepages() -> write_page for
+ the dirty page containing the on-disk inode X, the page is now locked
+ -> ntfs_write_mst_block() which clears PageUptodate() on the page to
+ prevent anyone else getting hold of it whilst it does the write out.
+ This is necessary as the on-disk inode needs "fixups" applied before
+ the write to disk which are removed again after the write and
+ PageUptodate is then set again. It then analyses the page looking
+ for dirty on-disk inodes and when it finds one it calls
+ ntfs_may_write_mft_record() to see if it is safe to write this
+ on-disk inode. This then calls ilookup5() to check if the
+ corresponding VFS inode is in icache(). This in turn calls ifind()
+ which waits on the inode lock via wait_on_inode whilst holding the
+ global inode_lock.
+ Process 2: pdflush results in a call to __sync_single_inode for the
+ same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
+ then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
+ read_cache_page() for the page (in page cache of table of inodes
+ $MFT, inode 0) containing the on-disk inode. This page has
+ PageUptodate() clear because of Process 1 (see above) so
+ read_cache_page() blocks when it tries to take the page lock for the
+ page so it can call ntfs_read_page().
+ Thus Process 1 is holding the page lock on the page containing the
+ on-disk inode X and it is waiting on the inode X to be unlocked in
+ ifind() so it can write the page out and then unlock the page.
+ And Process 2 is holding the inode lock on inode X and is waiting for
+ the page to be unlocked so it can call ntfs_readpage() or discover
+ that Process 1 set PageUptodate() again and use the page.
+ Thus we have a deadlock due to ifind() waiting on the inode lock.
+ The solution: The fix is to use the newly introduced
+ ilookup5_nowait() which does not wait on the inode's lock and hence
+ avoids the deadlock. This is safe as we do not care about the VFS
+ inode and only use the fact that it is in the VFS inode cache and the
+ fact that the vfs and ntfs inodes are one struct in memory to find
+ the ntfs inode in memory if present. Also, the ntfs inode has its
+ own locking so it does not matter if the vfs inode is locked.
2.1.22 - Many bug and race fixes and error handling improvements.
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3d0ba8e60adc52..ac9ff39aa8343e 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -948,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
na.name_len = 0;
na.type = AT_UNUSED;
/*
- * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or
- * we deadlock because the inode is already locked by the kernel
- * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
- * until the inode is unlocked before returning it and it never gets
- * unlocked because ntfs_should_write_mft_record() never returns. )-:
- * Fortunately, we have inode 0 pinned in icache for the duration of
- * the mount so we can access it directly.
+ * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
+ * we get here for it rather often.
*/
if (!mft_no) {
/* Balance the below iput(). */
vi = igrab(mft_vi);
BUG_ON(vi != mft_vi);
- } else
- vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
+ } else {
+ /*
+ * Have to use ilookup5_nowait() since ilookup5() waits for the
+ * inode lock which causes ntfs to deadlock when a concurrent
+ * inode write via the inode dirty code paths and the page
+ * dirty code path of the inode dirty code path when writing
+ * $MFT occurs.
+ */
+ vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+ }
if (vi) {
ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
/* The inode is in icache. */
@@ -1016,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
na.mft_no = MREF_LE(m->base_mft_record);
ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
"inode 0x%lx in icache.", mft_no, na.mft_no);
- vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na);
+ if (!na.mft_no) {
+ /* Balance the below iput(). */
+ vi = igrab(mft_vi);
+ BUG_ON(vi != mft_vi);
+ } else
+ vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+ &na);
if (!vi) {
/*
* The base inode is not in icache, write this extent mft