bk://linux-ntfs.bkbits.net/ntfs-2.6-devel aia21@cantab.net|ChangeSet|20040608103654|38230 aia21 # This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2004/06/08 11:36:54+01:00 aia21@cantab.net # NTFS: 2.1.13 - Enable overwriting of resident files and housekeeping of system files. # - Mark the volume dirty when (re)mounting read-write and mark it clean # when unmounting or remounting read-only. If any volume errors are # found, the volume is left marked dirty to force chkdsk to run. # - Add code to set the NT4 compatibility flag when (re)mounting # read-write for newer NTFS versions but leave it commented out for now # since we do not make any modifications that are NTFS 1.2 specific yet # and since setting this flag breaks Captive-NTFS which is not nice. # This code must be enabled once we start writing NTFS 1.2 specific # changes otherwise Windows NTFS driver might crash / cause corruption. # - Fix a silly bug that caused a deadlock in ntfs_mft_writepage(). # For inode 0, i.e. $MFT itself, we cannot use ilookup5() from # there because the inode is already locked by the kernel # (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits # until the inode is unlocked before returning it and it never gets # unlocked because ntfs_mft_writepage() never returns. )-: # Fortunately, we have inode 0 pinned in icache for the duration # of the mount so we can access it directly. # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/super.c # 2004/06/08 11:36:48+01:00 aia21@cantab.net +139 -35 # - Mark the volume dirty when (re)mounting read-write and mark it clean # when unmounting or remounting read-only. If any volume errors are # found, the volume is left marked dirty to force chkdsk to run. 
# - Add code to set the NT4 compatibility flag when (re)mounting # read-write for newer NTFS versions but leave it commented out for now # since we do not make any modifications that are NTFS 1.2 specific yet # and since setting this flag breaks Captive-NTFS which is not nice. # This code must be enabled once we start writing NTFS 1.2 specific # changes otherwise Windows NTFS driver might crash / cause corruption. # # fs/ntfs/mft.c # 2004/06/08 11:36:48+01:00 aia21@cantab.net +41 -5 # Fix a silly bug that caused a deadlock in ntfs_mft_writepage(). # For inode 0, i.e. $MFT itself, we cannot use ilookup5() from # there because the inode is already locked by the kernel # (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits # until the inode is unlocked before returning it and it never gets # unlocked because ntfs_mft_writepage() never returns. )-: # Fortunately, we have inode 0 pinned in icache for the duration # of the mount so we can access it directly. # # fs/ntfs/ChangeLog # 2004/06/08 11:36:48+01:00 aia21@cantab.net +1 -1 # Missed a line. # # fs/ntfs/Makefile # 2004/06/08 09:36:50+01:00 aia21@cantab.net +1 -1 # Bump version to 2.1.13. # # fs/ntfs/ChangeLog # 2004/06/08 09:36:50+01:00 aia21@cantab.net +12 -1 # Update for 2.1.13 release. # # Documentation/filesystems/ntfs.txt # 2004/06/08 09:36:50+01:00 aia21@cantab.net +13 -0 # Update for 2.1.13 release. # # ChangeSet # 2004/06/07 10:40:50+01:00 aia21@cantab.net # NTFS: Add functions ntfs_{clear,set}_volume_flags(), to modify the volume # information flags (fs/ntfs/super.c). # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/super.c # 2004/06/07 10:40:44+01:00 aia21@cantab.net +95 -0 # Add functions ntfs_{clear,set}_volume_flags(), to modify the volume # information flags. 
# # fs/ntfs/ChangeLog # 2004/06/07 10:40:44+01:00 aia21@cantab.net +2 -0 # Update # # ChangeSet # 2004/06/04 16:59:48+01:00 aia21@cantab.net # NTFS: Implement ntfs_mft_writepage() so it now checks if any of the mft # records in the page are dirty and if so redirties the page and # returns. Otherwise it just returns (after doing set_page_writeback(), # unlock_page(), end_page_writeback() or the radix-tree tag # PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus # allowing the VM to do with the page as it pleases. Also, at umount # time, now only throw away dirty mft (meta)data pages if dirty inodes # are present and ask the user to email us if they see this happening. # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/super.c # 2004/06/04 16:59:41+01:00 aia21@cantab.net +29 -11 # Only throw away dirty mft (meta)data page cache pages if dirty # inodes are present as this should never happen any more with the # new ntfs_mft_writepage() implementation. Ask the user to email # us if they see this happening. # # fs/ntfs/mft.c # 2004/06/04 16:59:41+01:00 aia21@cantab.net +167 -5 # Implement ntfs_mft_writepage() so that it checks if any of the mft # records in the page are dirty and if so redirties the page before # unlocking it. Otherwise it just returns (after doing # set_page_writeback(), unlock_page(), end_page_writeback() or the # radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the # page is clean). # # fs/ntfs/ChangeLog # 2004/06/04 16:59:41+01:00 aia21@cantab.net +8 -0 # Update # # ChangeSet # 2004/06/04 16:35:54+01:00 aia21@cantab.net # NTFS: Use set_page_writeback()/end_page_writeback() in ntfs_writepage() # resident attribute write code path as otherwise the radix-tree tag # PAGECACHE_TAG_DIRTY remains set even though the page is clean. 
# # Signed-off-by: Anton Altaparmakov # # fs/ntfs/aops.c # 2004/06/04 16:35:48+01:00 aia21@cantab.net +15 -12 # - Use set_page_writeback()/end_page_writeback() in ntfs_writepage() # resident attribute write code path as otherwise the radix-tree tag # PAGECACHE_TAG_DIRTY remains set even though the page is clean. # - Cleanup some debug output. # # fs/ntfs/ChangeLog # 2004/06/04 16:35:47+01:00 aia21@cantab.net +4 -0 # Update # # ChangeSet # 2004/06/01 17:00:58+01:00 aia21@cantab.net # NTFS: - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make # fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus # finally enabling resident file overwrite! (-8 This also includes a # placeholder for ->writepage (ntfs_mft_writepage()), which for now # just redirties the page and returns. Also, at umount time, we for # now throw away all mft data page cache pages after the last call to # ntfs_commit_inode() in the hope that all inodes will have been # written out by then and hence no dirty (meta)data will be lost. We # also check for this case and emit an error message telling the user # to run chkdsk. # - If the user is trying to enable (dir)atime updates, warn about the # fact that we are disabling them. # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/super.c # 2004/06/01 17:00:52+01:00 aia21@cantab.net +27 -2 # - At umount time, we for now throw away all mft data page cache # pages after the last call to ntfs_commit_inode() in the hope # that all inodes will have been written out by then and hence # no dirty (meta)data will be lost. We also check for this case # and emit an error message telling the user to run chkdsk. # - If the user is trying to enable (dir)atime updates, warn about # the fact that we are disabling them. # # fs/ntfs/mft.h # 2004/06/01 17:00:52+01:00 aia21@cantab.net +19 -0 # Implement {,__}mark_mft_record_dirty(). 
# # fs/ntfs/mft.c # 2004/06/01 17:00:52+01:00 aia21@cantab.net +77 -0 # Implement __mark_mft_record_dirty() and a placeholder for # ->writepage (ntfs_mft_writepage()), which for now just # redirties the page and returns. # # fs/ntfs/aops.c # 2004/06/01 17:00:52+01:00 aia21@cantab.net +14 -19 # Use mark_mft_record_dirty() in ntfs_writepage() and ntfs_commit_write(), # thus finally enabling resident file overwrite! (-8 # # fs/ntfs/ChangeLog # 2004/06/01 17:00:52+01:00 aia21@cantab.net +10 -1 # Update # # ChangeSet # 2004/05/28 16:24:23+01:00 aia21@cantab.net # NTFS: Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the # ntfs super operations. This gives us inode writing via the VFS inode # dirty code paths. Note: Access time updates are not implemented yet. # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/super.c # 2004/05/28 16:24:17+01:00 aia21@cantab.net +2 -2 # Set ntfs_write_inode() to be our sops->write_inode. # # fs/ntfs/inode.c # 2004/05/28 16:24:17+01:00 aia21@cantab.net +101 -16 # Implement ntfs_write_inode(). # # fs/ntfs/ChangeLog # 2004/05/28 16:24:17+01:00 aia21@cantab.net +7 -1 # Update. # # ChangeSet # 2004/05/28 12:38:37+01:00 aia21@cantab.net # NTFS: Commit open system inodes at umount time. This should make it # virtually impossible for sync_mft_mirror_umount() to ever be needed. # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/super.c # 2004/05/28 12:38:31+01:00 aia21@cantab.net +35 -0 # Commit open system inodes at umount time. # # fs/ntfs/ChangeLog # 2004/05/28 12:38:31+01:00 aia21@cantab.net +2 -0 # Update. # # ChangeSet # 2004/05/28 12:29:35+01:00 aia21@cantab.net # NTFS: Implement writing of mft records (fs/ntfs/mft.[hc]), which includes # keeping the mft mirror in sync with the mft when mirrored mft records # are written. The functions are write_mft_record{,_nolock}(). 
The # implementation is quite rudimentary for now with lots of things not # implemented yet but I am not sure any of them can actually occur so # I will wait for people to hit each one and only then implement it. # # Signed-off-by: Anton Altaparmakov # # fs/ntfs/mft.h # 2004/05/28 12:29:29+01:00 aia21@cantab.net +35 -0 # Add write_mft_record{,_nolock}(). # # fs/ntfs/mft.c # 2004/05/28 12:29:29+01:00 aia21@cantab.net +387 -0 # Add write_mft_record{,_nolock}(). # # fs/ntfs/compress.c # 2004/05/28 12:29:29+01:00 aia21@cantab.net +2 -2 # Error messages typo fixes. # # fs/ntfs/attrib.c # 2004/05/28 12:29:29+01:00 aia21@cantab.net +2 -2 # Debug and error messages typo fixes. # # fs/ntfs/aops.c # 2004/05/28 12:29:29+01:00 aia21@cantab.net +1 -1 # Debug message typo fix. # # fs/ntfs/Makefile # 2004/05/28 12:29:29+01:00 aia21@cantab.net +1 -1 # Update. # # fs/ntfs/ChangeLog # 2004/05/28 12:29:29+01:00 aia21@cantab.net +15 -0 # Update. # diff -Nru a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt --- a/Documentation/filesystems/ntfs.txt 2004-06-08 21:54:24 -07:00 +++ b/Documentation/filesystems/ntfs.txt 2004-06-08 21:54:24 -07:00 @@ -273,6 +273,19 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. +2.1.13: + - Implement writing of inodes (access time updates are not implemented + yet so mounting with -o noatime,nodiratime is enforced). + - Enable writing out of resident files so you can now overwrite any + uncompressed, unencrypted, nonsparse file as long as you do not + change the file size. + - Add housekeeping of ntfs system files so that ntfsfix no longer needs + to be run after writing to an NTFS volume. + NOTE: This still leaves quota tracking and user space journalling on + the side but they should not cause data corruption. In the worst + case the charged quotas will be out of date ($Quota) and some + userspace applications might get confused due to the out of date + userspace journal ($UsnJrnl). 
2.1.12: - Fix the second fix to the decompression engine from the 2.1.9 release and some further internals cleanups. diff -Nru a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog --- a/fs/ntfs/ChangeLog 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/ChangeLog 2004-06-08 21:54:24 -07:00 @@ -1,4 +1,4 @@ -ToDo: +ToDo/Notes: - Find and fix bugs. - Either invalidate quotas or update the quota charges on NTFS 3.x volumes with quota tracking enabled ($Quota). @@ -11,8 +11,10 @@ pages as nothing can dirty a page other than ourselves. Should this change, we will really need to roll our own ->set_page_dirty(). - Implement sops->dirty_inode() to implement {a,m,c}time updates and - such things. - - Implement sops->write_inode(). + such things. This should probably just flag the ntfs inode such that + sops->write_inode(), i.e. ntfs_write_inode(), will copy the times + when it is invoked rather than having to update the mft record + every time. - In between ntfs_prepare/commit_write, need exclusion between simultaneous file extensions. Need perhaps an NInoResizeUnderway() flag which we can set in ntfs_prepare_write() and clear again in @@ -24,6 +26,61 @@ OTOH, perhaps i_sem, which is held accross generic_file_write is sufficient for synchronisation here. We then just need to make sure ntfs_readpage/writepage/truncate interoperate properly with us. + - Implement mft.c::sync_mft_mirror_umount(). We currently will just + leave the volume dirty on umount if the final iput(vol->mft_ino) + causes a write of any mirrored mft records due to the mft mirror + inode having been discarded already. Whether this can actually ever + happen is unclear however so it is worth waiting until someone hits + the problem. + - Enable the code for setting the NT4 compatibility flag when we start + making NTFS 1.2 specific modifications. + +2.1.13 - Enable overwriting of resident files and housekeeping of system files. 
+ + - Implement writing of mft records (fs/ntfs/mft.[hc]), which includes + keeping the mft mirror in sync with the mft when mirrored mft records + are written. The functions are write_mft_record{,_nolock}(). The + implementation is quite rudimentary for now with lots of things not + implemented yet but I am not sure any of them can actually occur so + I will wait for people to hit each one and only then implement it. + - Commit open system inodes at umount time. This should make it + virtually impossible for sync_mft_mirror_umount() to ever be needed. + - Implement ->write_inode (fs/ntfs/inode.c::ntfs_write_inode()) for the + ntfs super operations. This gives us inode writing via the VFS inode + dirty code paths. Note: Access time updates are not implemented yet. + - Implement fs/ntfs/mft.[hc]::{,__}mark_mft_record_dirty() and make + fs/ntfs/aops.c::ntfs_writepage() and ntfs_commit_write() use it, thus + finally enabling resident file overwrite! (-8 This also includes a + placeholder for ->writepage (ntfs_mft_writepage()), which for now + just redirties the page and returns. Also, at umount time, we for + now throw away all mft data page cache pages after the last call to + ntfs_commit_inode() in the hope that all inodes will have been + written out by then and hence no dirty (meta)data will be lost. We + also check for this case and emit an error message telling the user + to run chkdsk. + - Use set_page_writeback() and end_page_writeback() in the resident + attribute code path of fs/ntfs/aops.c::ntfs_writepage() otherwise + the radix-tree tag PAGECACHE_TAG_DIRTY remains set even though the + page is clean. + - Implement ntfs_mft_writepage() so it now checks if any of the mft + records in the page are dirty and if so redirties the page and + returns. 
Otherwise it just returns (after doing set_page_writeback(), + unlock_page(), end_page_writeback() or the radix-tree tag + PAGECACHE_TAG_DIRTY remains set even though the page is clean), thus + allowing the VM to do with the page as it pleases. Also, at umount + time, now only throw away dirty mft (meta)data pages if dirty inodes + are present and ask the user to email us if they see this happening. + - Add functions ntfs_{clear,set}_volume_flags(), to modify the volume + information flags (fs/ntfs/super.c). + - Mark the volume dirty when (re)mounting read-write and mark it clean + when unmounting or remounting read-only. If any volume errors are + found, the volume is left marked dirty to force chkdsk to run. + - Add code to set the NT4 compatibility flag when (re)mounting + read-write for newer NTFS versions but leave it commented out for now + since we do not make any modifications that are NTFS 1.2 specific yet + and since setting this flag breaks Captive-NTFS which is not nice. + This code must be enabled once we start writing NTFS 1.2 specific + changes otherwise Windows NTFS driver might crash / cause corruption. 2.1.12 - Fix the second fix to the decompression engine and some cleanups. 
diff -Nru a/fs/ntfs/Makefile b/fs/ntfs/Makefile --- a/fs/ntfs/Makefile 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/Makefile 2004-06-08 21:54:24 -07:00 @@ -5,7 +5,7 @@ ntfs-objs := aops.o attrib.o compress.o debug.o dir.o file.o inode.o mft.o \ mst.o namei.o super.o sysctl.o unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.12\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.13\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff -Nru a/fs/ntfs/aops.c b/fs/ntfs/aops.c --- a/fs/ntfs/aops.c 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/aops.c 2004-06-08 21:54:24 -07:00 @@ -478,8 +478,8 @@ ni = NTFS_I(vi); vol = ni->vol; - ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " - "0x%lx.\n", vi->i_ino, ni->type, page->index); + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", vi->i_ino, ni->type, page->index); BUG_ON(!NInoNonResident(ni)); BUG_ON(NInoMstProtected(ni)); @@ -778,9 +778,8 @@ * * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying * the data to the mft record (which at this stage is most likely in memory). - * Thus, in this case, I/O is synchronous, as even if the mft record is not - * cached at this point in time, we need to wait for it to be read in before we - * can do the copy. + * The mft record is then marked dirty and written out asynchronously via the + * vfs inode dirty code path. * * Note the caller clears the page dirty flag before calling ntfs_writepage(). * @@ -875,16 +874,6 @@ BUG_ON(page_has_buffers(page)); BUG_ON(!PageUptodate(page)); - // TODO: Consider using PageWriteback() + unlock_page() in 2.5 once the - // "VM fiddling has ended". Note, don't forget to replace all the - // unlock_page() calls further below with end_page_writeback() ones. - // FIXME: Make sure it is ok to SetPageError() on unlocked page under - // writeback before doing the change! 
-#if 0 - set_page_writeback(page); - unlock_page(page); -#endif - if (!NInoAttr(ni)) base_ni = ni; else @@ -935,6 +924,14 @@ bytes = PAGE_CACHE_SIZE; /* + * Keep the VM happy. This must be done otherwise the radix-tree tag + * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + + /* * Here, we don't need to zero the out of bounds area everytime because * the below memcpy() already takes care of the mmap-at-end-of-file * requirements. If the file is converted to a non-resident one, then @@ -948,7 +945,10 @@ * expose data to userspace/disk which should never have been exposed. * * FIXME: Ensure that i_size increases do the zeroing/overwriting and - * if we cannot guarantee that, then enable the zeroing below. + * if we cannot guarantee that, then enable the zeroing below. If the + * zeroing below is enabled, we MUST move the unlock_page() from above + * to after the kunmap_atomic(), i.e. just before the + * end_page_writeback(). */ kaddr = kmap_atomic(page, KM_USER0); @@ -966,11 +966,10 @@ #endif kunmap_atomic(kaddr, KM_USER0); - unlock_page(page); + end_page_writeback(page); - // TODO: Mark mft record dirty so it gets written back. - ntfs_error(vi->i_sb, "Writing to resident files is not supported yet. " - "Wrote to memory only..."); + /* Mark the mft record dirty, so it gets written back. 
*/ + mark_mft_record_dirty(ctx->ntfs_ino); put_attr_search_ctx(ctx); unmap_mft_record(base_ni); @@ -1022,7 +1021,7 @@ ni = NTFS_I(vi); vol = ni->vol; - ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type, page->index, from, to); @@ -1379,7 +1378,7 @@ struct inode *vi = page->mapping->host; ntfs_inode *ni = NTFS_I(vi); - ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type, page->index, from, to); @@ -1487,7 +1486,7 @@ vi = page->mapping->host; - ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, NTFS_I(vi)->type, page->index, from, to); @@ -1583,7 +1582,7 @@ vi = page->mapping->host; ni = NTFS_I(vi); - ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, ni->type, page->index, from, to); @@ -1734,9 +1733,8 @@ } kunmap_atomic(kaddr, KM_USER0); - // TODO: Mark mft record dirty so it gets written back. - ntfs_error(vi->i_sb, "Writing to resident files is not supported yet. " - "Wrote to memory only..."); + /* Mark the mft record dirty, so it gets written back. 
*/ + mark_mft_record_dirty(ctx->ntfs_ino); put_attr_search_ctx(ctx); unmap_mft_record(base_ni); diff -Nru a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c --- a/fs/ntfs/attrib.c 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/attrib.c 2004-06-08 21:54:24 -07:00 @@ -624,7 +624,7 @@ if (drl[ds].vcn == marker_vcn) { ntfs_debug("Old marker = 0x%llx, replacing " - "with LCN_ENOENT.\n", + "with LCN_ENOENT.", (unsigned long long) drl[ds].lcn); drl[ds].lcn = (LCN)LCN_ENOENT; @@ -1565,7 +1565,7 @@ goto do_next_attr_loop; } ntfs_error(base_ni->vol->sb, "Inode contains corrupt attribute list " - "attribute.\n"); + "attribute."); if (ni != base_ni) { unmap_extent_mft_record(ni); ctx->ntfs_ino = base_ni; diff -Nru a/fs/ntfs/compress.c b/fs/ntfs/compress.c --- a/fs/ntfs/compress.c 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/compress.c 2004-06-08 21:54:24 -07:00 @@ -433,7 +433,7 @@ goto do_next_tag; return_overflow: - ntfs_error(NULL, "Failed. Returning -EOVERFLOW.\n"); + ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); goto return_error; } @@ -851,7 +851,7 @@ if (err) { ntfs_error(vol->sb, "ntfs_decompress() failed in inode " "0x%lx with error code %i. Skipping " - "this compression block.\n", + "this compression block.", ni->mft_no, -err); /* Release the unfinished pages. */ for (; prev_cur_page < cur_page; prev_cur_page++) { diff -Nru a/fs/ntfs/inode.c b/fs/ntfs/inode.c --- a/fs/ntfs/inode.c 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/inode.c 2004-06-08 21:54:24 -07:00 @@ -1960,49 +1960,134 @@ return err; } +/** + * ntfs_write_inode - write out a dirty inode + * @vi: inode to write out + * @sync: if true, write out synchronously + * + * Write out a dirty inode to disk including any extent inodes if present. + * + * If @sync is true, commit the inode to disk and wait for io completion. This + * is done using write_mft_record(). + * + * If @sync is false, just schedule the write to happen but do not wait for i/o + * completion. 
In 2.6 kernels, scheduling usually happens just by virtue of + * marking the page (and in this case mft record) dirty but we do not implement + * this yet as write_mft_record() largely ignores the @sync parameter and + * always performs synchronous writes. + */ void ntfs_write_inode(struct inode *vi, int sync) { ntfs_inode *ni = NTFS_I(vi); +#if 0 + attr_search_context *ctx; +#endif + MFT_RECORD *m; + int err = 0; ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", vi->i_ino); - /* * Dirty attribute inodes are written via their real inodes so just - * clean them here. + * clean them here. TODO: Take care of access time updates. */ if (NInoAttr(ni)) { NInoClearDirty(ni); return; } - - /* Write this base mft record. */ - if (NInoDirty(ni)) { - ntfs_warning(vi->i_sb, "Cleaning dirty inode 0x%lx without " - "writing to disk as this is not yet " - "implemented.", vi->i_ino); - NInoClearDirty(ni); + /* Map, pin, and lock the mft record belonging to the inode. */ + m = map_mft_record(ni); + if (unlikely(IS_ERR(m))) { + err = PTR_ERR(m); + goto err_out; } - +#if 0 + /* Obtain the standard information attribute. */ + ctx = get_attr_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + if (unlikely(!lookup_attr(AT_STANDARD_INFORMATION, NULL, 0, + IGNORE_CASE, 0, NULL, 0, ctx))) { + put_attr_search_ctx(ctx); + err = -ENOENT; + goto unm_err_out; + } + // TODO: Update the access times in the standard information attribute + // which is now in ctx->attr. + // - Probably want to have use sops->dirty_inode() to set a flag that + // we need to update the times here rather than having to blindly do + // it every time. Or even don't do it here at all and do it in + // sops->dirty_inode() instead. Problem with this would be that + // sops->dirty_inode() must be atomic under certain circumstances + // and mapping mft records and such like is not atomic. 
+ // - For atime updates also need to check whether they are enabled in + // the superblock flags. + ntfs_warning(vi->i_sb, "Access time updates not implement yet."); + /* + * We just modified the mft record containing the standard information + * attribute. So need to mark the mft record dirty, too, but we do it + * manually so that mark_inode_dirty() is not called again. + * TODO: Only do this if there was a change in any of the times! + */ + if (!NInoTestSetDirty(ctx->ntfs_ino)) + __set_page_dirty_nobuffers(ctx->ntfs_ino->page); + put_attr_search_ctx(ctx); +#endif + /* Write this base mft record. */ + if (NInoDirty(ni)) + err = write_mft_record(ni, m, sync); /* Write all attached extent mft records. */ down(&ni->extent_lock); if (ni->nr_extents > 0) { - int i; ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; + int i; + ntfs_debug("Writing %i extent inodes.", ni->nr_extents); for (i = 0; i < ni->nr_extents; i++) { ntfs_inode *tni = extent_nis[i]; if (NInoDirty(tni)) { - ntfs_warning(vi->i_sb, "Cleaning dirty extent " - "inode 0x%lx without writing " - "to disk as this is not yet " - "implemented.", tni->mft_no); - NInoClearDirty(tni); + MFT_RECORD *tm = map_mft_record(tni); + int ret; + + if (unlikely(IS_ERR(tm))) { + if (!err || err == -ENOMEM) + err = PTR_ERR(tm); + continue; + } + ret = write_mft_record(tni, tm, sync); + unmap_mft_record(tni); + if (unlikely(ret)) { + if (!err || err == -ENOMEM) + err = ret; + } } } } up(&ni->extent_lock); + unmap_mft_record(ni); + if (unlikely(err)) + goto err_out; + ntfs_debug("Done."); + return; +#if 0 +unm_err_out: + unmap_mft_record(ni); +#endif +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Not enough memory to write inode. " + "Marking the inode dirty again, so the VFS " + "retries later."); + mark_inode_dirty(vi); + } else { + ntfs_error(vi->i_sb, "Failed (error code %i): Marking inode " + "as bad. 
You should run chkdsk.", -err); + make_bad_inode(vi); + } + return; } #endif /* NTFS_RW */ diff -Nru a/fs/ntfs/mft.c b/fs/ntfs/mft.c --- a/fs/ntfs/mft.c 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/mft.c 2004-06-08 21:54:24 -07:00 @@ -102,6 +102,13 @@ */ extern int ntfs_readpage(struct file *, struct page *); +#ifdef NTFS_RW +/** + * ntfs_mft_writepage - forward declaration, function is further below + */ +static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc); +#endif /* NTFS_RW */ + /** * ntfs_mft_aops - address space operations for access to $MFT * @@ -112,6 +119,10 @@ .readpage = ntfs_readpage, /* Fill page with data. */ .sync_page = block_sync_page, /* Currently, just unplugs the disk request queue. */ +#ifdef NTFS_RW + .writepage = ntfs_mft_writepage, /* Write out the dirty mft + records in a page. */ +#endif /* NTFS_RW */ }; /** @@ -429,3 +440,654 @@ ntfs_clear_extent_inode(ni); return m; } + +#ifdef NTFS_RW + +/** + * __mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Internal function. Users should call mark_mft_record_dirty() instead. + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: We only set I_DIRTY_SYNC and I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * on the base vfs inode, because even though file data may have been modified, + * it is dirty in the inode meta data rather than the data page cache of the + * inode, and thus there are no data pages that need writing out. Therefore, a + * full mark_inode_dirty() is overkill. 
A mark_inode_dirty_sync(), on the + * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to + * ensure ->write_inode is called from generic_osync_inode() and this needs to + * happen or the file data would not necessarily hit the device synchronously, + * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC + * simply "feels" better than just I_DIRTY_SYNC, since the file data has not + * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own + * would suggest. + */ +void __mark_mft_record_dirty(ntfs_inode *ni) +{ + struct page *page = ni->page; + ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(!page); + BUG_ON(NInoAttr(ni)); + + /* + * Set the page containing the mft record dirty. This also marks the + * $MFT inode dirty (I_DIRTY_PAGES). + */ + __set_page_dirty_nobuffers(page); + + /* Determine the base vfs inode and mark it dirty, too. */ + down(&ni->extent_lock); + if (likely(ni->nr_extents >= 0)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + up(&ni->extent_lock); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC); +} + +static const char *ntfs_please_email = "Please email " + "linux-ntfs-dev@lists.sourceforge.net and say that you saw " + "this message. Thank you."; + +/** + * sync_mft_mirror_umount - synchronise an mft record to the mft mirror + * @ni: ntfs inode whose mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * + * Write the mapped, mst protected (extent) mft record @m described by the + * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr) bypassing + * the page cache and the $MFTMirr inode itself. + * + * This function is only for use at umount time when the mft mirror inode has + * already been disposed off. We BUG() if we are called while the mft mirror + * inode is still attached to the volume. + * + * On success return 0. On error return -errno. 
+ * + * NOTE: This function is not implemented yet as I am not convinced it can + * actually be triggered considering the sequence of commits we do in super.c:: + * ntfs_put_super(). But just in case we provide this place holder as the + * alternative would be either to BUG() or to get a NULL pointer dereference + * and Oops. + */ +static int sync_mft_mirror_umount(ntfs_inode *ni, MFT_RECORD *m) +{ + ntfs_volume *vol = ni->vol; + + BUG_ON(vol->mftmirr_ino); + ntfs_error(vol->sb, "Umount time mft mirror syncing is not " + "implemented yet. %s", ntfs_please_email); + return -EOPNOTSUPP; +} + +/** + * sync_mft_mirror - synchronize an mft record to the mft mirror + * @ni: ntfs inode whose mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * @sync: if true, wait for i/o completion + * + * Write the mapped, mst protected (extent) mft record @m described by the + * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr). + * + * On success return 0. On error return -errno and set the volume errors flag + * in the ntfs_volume to which @ni belongs. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + u8 *kmirr; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(!max_bhs); + if (unlikely(!vol->mftmirr_ino)) { + /* This could happen during umount... 
*/ + err = sync_mft_mirror_umount(ni, m); + if (likely(!err)) + return err; + goto err_out; + } + /* Get the page containing the mirror copy of the mft record @m. */ + page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >> + (PAGE_CACHE_SHIFT - vol->mft_record_size_bits)); + if (unlikely(IS_ERR(page))) { + ntfs_error(vol->sb, "Failed to map mft mirror page."); + err = PTR_ERR(page); + goto err_out; + } + /* + * Exclusion against other writers. This should never be a problem + * since the page in which the mft record @m resides is also locked and + * hence any other writers would be held up there but it is better to + * make sure no one is writing from elsewhere. + */ + lock_page(page); + /* The address in the page of the mirror copy of the mft record @m. */ + kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits) + & ~PAGE_CACHE_MASK); + /* Copy the mst protected mft record to the mirror. */ + memcpy(kmirr, m, vol->mft_record_size); + /* Make sure we have mapped buffers. */ + if (!page_has_buffers(page)) { +no_buffers_err_out: + ntfs_error(vol->sb, "Writing mft mirror records without " + "existing buffers is not implemented yet. %s", + ntfs_please_email); + err = -EOPNOTSUPP; + goto unlock_err_out; + } + bh = head = page_buffers(page); + if (!bh) + goto no_buffers_err_out; + nr_bhs = 0; + block_start = 0; + m_start = kmirr - (u8*)page_address(page); + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* + * If the buffer is outside the mft record, just skip it, + * clearing it if it is dirty to make sure it is not written + * out. It should never be marked dirty but better be safe. + */ + if ((block_end <= m_start) || (block_start >= m_end)) { + if (buffer_dirty(bh)) { + ntfs_warning(vol->sb, "Clearing dirty mft " + "record page buffer. 
%s", + ntfs_please_email); + clear_buffer_dirty(bh); + } + continue; + } + if (!buffer_mapped(bh)) { + ntfs_error(vol->sb, "Writing mft mirror records " + "without existing mapped buffers is " + "not implemented yet. %s", + ntfs_please_email); + err = -EOPNOTSUPP; + continue; + } + if (!buffer_uptodate(bh)) { + ntfs_error(vol->sb, "Writing mft mirror records " + "without existing uptodate buffers is " + "not implemented yet. %s", + ntfs_please_email); + err = -EOPNOTSUPP; + continue; + } + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (likely(!err)) { + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (unlikely(test_set_buffer_locked(tbh))) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + if (buffer_dirty(tbh)) + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page & buffer + * states don't become out of sync. + */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + } else /* if (unlikely(err)) */ { + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); + } +unlock_err_out: + /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)kmirr); + flush_dcache_page(page); + unlock_page(page); + ntfs_unmap_page(page); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! 
*/ + ntfs_error(vol->sb, "I/O error while writing mft mirror " + "record 0x%lx! You should unmount the volume " + "and run chkdsk or ntfsfix.", ni->mft_no); + goto err_out; + } + ntfs_debug("Done."); + return 0; +err_out: + ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i). " + "Volume will be left marked dirty on umount. Run " + "ntfsfix on the partition after umounting to correct " + "this.", -err); + /* We don't want to clear the dirty bit on umount. */ + NVolSetErrors(vol); + return err; +} + +/** + * write_mft_record_nolock - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * Write the mapped (extent) mft record @m described by the (regular or extent) + * ntfs inode @ni to backing store. If the mft record @m has a counterpart in + * the mft mirror, that is also updated. + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. The caller should call make_bad_inode() on + * the base inode to ensure no more access happens to this inode. We do not do + * it here as the caller may want to finish writing other extent mft records + * first to minimize on-disk metadata inconsistencies. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * However, if the mft record has a counterpart in the mft mirror and @sync is + * true, we write the mft record, wait for i/o completion, and only then write + * the mft mirror copy. This ensures that if the system crashes either the mft + * or the mft mirror will contain a self-consistent mft record @m. If @sync is + * false on the other hand, we start i/o on both and then wait for completion + * on them. 
This provides a speedup but no longer guarantees that you will end + * up with a self-consistent mft record in the case of a crash but if you asked + * for asynchronous writing you probably do not care about that anyway. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page = ni->page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[max_bhs]; + struct buffer_head *bh, *head; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + BUG_ON(!max_bhs); + BUG_ON(!page); + BUG_ON(!PageLocked(page)); + /* + * If the ntfs_inode is clean no need to do anything. If it is dirty, + * mark it as clean now so that it can be redirtied later on if needed. + * There is no danger of races as long as the caller is holding the + * locks for the mft record @m and the page it is in. + */ + if (!NInoTestClearDirty(ni)) + goto done; + /* Make sure we have mapped buffers. */ + if (!page_has_buffers(page)) { +no_buffers_err_out: + ntfs_error(vol->sb, "Writing mft records without existing " + "buffers is not implemented yet. %s", + ntfs_please_email); + err = -EOPNOTSUPP; + goto err_out; + } + bh = head = page_buffers(page); + if (!bh) + goto no_buffers_err_out; + nr_bhs = 0; + block_start = 0; + m_start = ni->page_ofs; + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* + * If the buffer is outside the mft record, just skip it, + * clearing it if it is dirty to make sure it is not written + * out. It should never be marked dirty but better be safe.
+ */ + if ((block_end <= m_start) || (block_start >= m_end)) { + if (buffer_dirty(bh)) { + ntfs_warning(vol->sb, "Clearing dirty mft " + "record page buffer. %s", + ntfs_please_email); + clear_buffer_dirty(bh); + } + continue; + } + if (!buffer_mapped(bh)) { + ntfs_error(vol->sb, "Writing mft records without " + "existing mapped buffers is not " + "implemented yet. %s", + ntfs_please_email); + err = -EOPNOTSUPP; + continue; + } + if (!buffer_uptodate(bh)) { + ntfs_error(vol->sb, "Writing mft records without " + "existing uptodate buffers is not " + "implemented yet. %s", + ntfs_please_email); + err = -EOPNOTSUPP; + continue; + } + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(err)) + goto cleanup_out; + /* Apply the mst protection fixups. */ + err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); + if (err) { + ntfs_error(vol->sb, "Failed to apply mst fixups!"); + goto cleanup_out; + } + flush_dcache_mft_record_page(ni); + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (unlikely(test_set_buffer_locked(tbh))) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + if (buffer_dirty(tbh)) + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (!sync && ni->mft_no < vol->mftmirr_size) + sync_mft_mirror(ni, m, sync); + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page & buffer states + * don't become out of sync. 
+ */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (sync && ni->mft_no < vol->mftmirr_size) + sync_mft_mirror(ni, m, sync); + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)m); + flush_dcache_mft_record_page(ni); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! */ + ntfs_error(vol->sb, "I/O error while writing mft record " + "0x%lx! Marking base inode as bad. You " + "should unmount the volume and run chkdsk.", + ni->mft_no); + goto err_out; + } +done: + ntfs_debug("Done."); + return 0; +cleanup_out: + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); +err_out: + /* + * Current state: all buffers are clean, unlocked, and uptodate. + * The caller should mark the base inode as bad so that no more i/o + * happens. ->clear_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. + */ + if (err == -ENOMEM) { + ntfs_error(vol->sb, "Not enough memory to write mft record. " + "Redirtying so the write is retried later."); + mark_mft_record_dirty(ni); + err = 0; + } + return err; +} + +/** + * ntfs_mft_writepage - check if a metadata page contains dirty mft records + * @page: metadata page possibly containing dirty mft records + * @wbc: writeback control structure + * + * This is called from the VM when it wants to have a dirty $MFT/$DATA metadata + * page cache page cleaned. The VM has already locked the page and marked it + * clean. Instead of writing the page as a conventional ->writepage function + * would do, we check if the page still contains any dirty mft records (it must + * have done at some point in the past since the page was marked dirty) and if + * none are found, i.e. all mft records are clean, we unlock the page and + * return. The VM is then free to do with the page as it pleases. 
If on the + * other hand we do find any dirty mft records in the page, we redirty the page + * before unlocking it and returning so the VM knows that the page is still + * busy and cannot be thrown out. + * + * Note, we do not actually write any dirty mft records here because they are + * dirty inodes and hence will be written by the VFS inode dirty code paths. + * There is no need to write them from the VM page dirty code paths, too and in + * fact once we implement journalling it would be a complete nightmare having + * two code paths leading to mft record writeout. + */ +static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *mft_vi = page->mapping->host; + struct super_block *sb = mft_vi->i_sb; + ntfs_volume *vol = NTFS_SB(sb); + u8 *maddr; + MFT_RECORD *m; + ntfs_inode **extent_nis; + unsigned long mft_no; + int nr, i, j; + BOOL is_dirty = FALSE; + + BUG_ON(mft_vi != vol->mft_ino); + /* The first mft record number in the page. */ + mft_no = page->index << (PAGE_CACHE_SHIFT - vol->mft_record_size_bits); + /* Number of mft records in the page. */ + nr = PAGE_CACHE_SIZE >> vol->mft_record_size_bits; + BUG_ON(!nr); + ntfs_debug("Entering for %i inodes starting at 0x%lx.", nr, mft_no); + /* Iterate over the mft records in the page looking for a dirty one. */ + maddr = (u8*)kmap(page); + for (i = 0; i < nr; ++i, ++mft_no, maddr += vol->mft_record_size) { + struct inode *vi; + ntfs_inode *ni, *eni; + ntfs_attr na; + + na.mft_no = mft_no; + na.name = NULL; + na.name_len = 0; + na.type = AT_UNUSED; + /* + * Check if the inode corresponding to this mft record is in + * the VFS inode cache and obtain a reference to it if it is. + */ + ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); + /* + * For inode 0, i.e. 
$MFT itself, we cannot use ilookup5() from + * here or we deadlock because the inode is already locked by + * the kernel (fs/fs-writeback.c::__sync_single_inode()) and + * ilookup5() waits until the inode is unlocked before + * returning it and it never gets unlocked because + * ntfs_mft_writepage() never returns. )-: Fortunately, we + * have inode 0 pinned in icache for the duration of the mount + * so we can access it directly. + */ + if (!mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na); + if (vi) { + ntfs_debug("Inode 0x%lx is in icache.", mft_no); + /* The inode is in icache. Check if it is dirty. */ + ni = NTFS_I(vi); + if (!NInoDirty(ni)) { + /* The inode is not dirty, skip this record. */ + ntfs_debug("Inode 0x%lx is not dirty, " + "continuing search.", mft_no); + iput(vi); + continue; + } + ntfs_debug("Inode 0x%lx is dirty, aborting search.", + mft_no); + /* The inode is dirty, no need to search further. */ + iput(vi); + is_dirty = TRUE; + break; + } + ntfs_debug("Inode 0x%lx is not in icache.", mft_no); + /* The inode is not in icache. */ + /* Skip the record if it is not a mft record (type "FILE"). */ + if (!ntfs_is_mft_recordp(maddr)) { + ntfs_debug("Mft record 0x%lx is not a FILE record, " + "continuing search.", mft_no); + continue; + } + m = (MFT_RECORD*)maddr; + /* + * Skip the mft record if it is not in use. FIXME: What about + * deleted/deallocated (extent) inodes? (AIA) + */ + if (!(m->flags & MFT_RECORD_IN_USE)) { + ntfs_debug("Mft record 0x%lx is not in use, " + "continuing search.", mft_no); + continue; + } + /* Skip the mft record if it is a base inode. */ + if (!m->base_mft_record) { + ntfs_debug("Mft record 0x%lx is a base record, " + "continuing search.", mft_no); + continue; + } + /* + * This is an extent mft record. Check if the inode + * corresponding to its base mft record is in icache. 
+ */ + na.mft_no = MREF_LE(m->base_mft_record); + ntfs_debug("Mft record 0x%lx is an extent record. Looking " + "for base inode 0x%lx in icache.", mft_no, + na.mft_no); + vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, + &na); + if (!vi) { + /* + * The base inode is not in icache. Skip this extent + * mft record. + */ + ntfs_debug("Base inode 0x%lx is not in icache, " + "continuing search.", na.mft_no); + continue; + } + ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); + /* + * The base inode is in icache. Check if it has the extent + * inode corresponding to this extent mft record attached. + */ + ni = NTFS_I(vi); + down(&ni->extent_lock); + if (ni->nr_extents <= 0) { + /* + * The base inode has no attached extent inodes. Skip + * this extent mft record. + */ + up(&ni->extent_lock); + iput(vi); + continue; + } + /* Iterate over the attached extent inodes. */ + extent_nis = ni->ext.extent_ntfs_inos; + for (eni = NULL, j = 0; j < ni->nr_extents; ++j) { + if (mft_no == extent_nis[j]->mft_no) { + /* + * Found the extent inode corresponding to this + * extent mft record. + */ + eni = extent_nis[j]; + break; + } + } + /* + * If the extent inode was not attached to the base inode, skip + * this extent mft record. + */ + if (!eni) { + up(&ni->extent_lock); + iput(vi); + continue; + } + /* + * Found the extent inode corresponding to this extent mft + * record. If it is dirty, no need to search further. + */ + if (NInoDirty(eni)) { + up(&ni->extent_lock); + iput(vi); + is_dirty = TRUE; + break; + } + /* The extent inode is not dirty, so do the next record. */ + up(&ni->extent_lock); + iput(vi); + } + kunmap(page); + /* If a dirty mft record was found, redirty the page. */ + if (is_dirty) { + ntfs_debug("Inode 0x%lx is dirty. Redirtying the page " + "starting at inode 0x%lx.", mft_no, + page->index << (PAGE_CACHE_SHIFT - + vol->mft_record_size_bits)); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } else { + /* + * Keep the VM happy.
This must be done otherwise the + * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though + * the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + } + ntfs_debug("Done."); + return 0; +} + +#endif /* NTFS_RW */ diff -Nru a/fs/ntfs/mft.h b/fs/ntfs/mft.h --- a/fs/ntfs/mft.h 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/mft.h 2004-06-08 21:54:24 -07:00 @@ -57,6 +57,60 @@ flush_dcache_page(ni->page); } +extern void __mark_mft_record_dirty(ntfs_inode *ni); + +/** + * mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: Do not do anything if the mft record is already marked dirty. + */ +static inline void mark_mft_record_dirty(ntfs_inode *ni) +{ + if (!NInoTestSetDirty(ni)) + __mark_mft_record_dirty(ni); +} + +extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); + +/** + * write_mft_record - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * This is just a wrapper for write_mft_record_nolock() (see mft.c), which + * locks the page for the duration of the write. This ensures that there are + * no race conditions between writing the mft record via the dirty inode code + * paths and via the page cache write back code paths or between writing + * neighbouring mft records residing in the same page. + * + * Locking the page also serializes us against ->readpage() if the page is not + * uptodate. + * + * On success, clean the mft record and return 0. 
On error, leave the mft + * record dirty and return -errno. The caller should call make_bad_inode() on + * the base inode to ensure no more access happens to this inode. We do not do + * it here as the caller may want to finish writing other extent mft records + * first to minimize on-disk metadata inconsistencies. + */ +static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + struct page *page = ni->page; + int err; + + BUG_ON(!page); + lock_page(page); + err = write_mft_record_nolock(ni, m, sync); + unlock_page(page); + return err; +} + #endif /* NTFS_RW */ #endif /* _LINUX_NTFS_MFT_H */ diff -Nru a/fs/ntfs/super.c b/fs/ntfs/super.c --- a/fs/ntfs/super.c 2004-06-08 21:54:24 -07:00 +++ b/fs/ntfs/super.c 2004-06-08 21:54:24 -07:00 @@ -291,6 +291,101 @@ return FALSE; } +#ifdef NTFS_RW + +/** + * ntfs_write_volume_flags - write new flags to the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: new flags value for the volume information flags + * + * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() + * instead (see below). + * + * Replace the volume information flags on the volume @vol with the value + * supplied in @flags. Note, this overwrites the volume information flags, so + * make sure to combine the flags you want to modify with the old flags and use + * the result when calling ntfs_write_volume_flags(). + * + * Return 0 on success and -errno on error. 
+ */ +static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +{ + ntfs_inode *ni = NTFS_I(vol->vol_ino); + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + attr_search_context *ctx; + int err; + + ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", + vol->vol_flags, flags); + if (vol->vol_flags == flags) + goto done; + BUG_ON(!ni); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = get_attr_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto put_unm_err_out; + } + if (!lookup_attr(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, ctx)) { + err = -EIO; + goto put_unm_err_out; + } + vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + vol->vol_flags = vi->flags = flags; + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + put_attr_search_ctx(ctx); + unmap_mft_record(ni); +done: + ntfs_debug("Done."); + return 0; +put_unm_err_out: + if (ctx) + put_attr_search_ctx(ctx); + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i.", -err); + return err; +} + +/** + * ntfs_set_volume_flags - set bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to set on the volume + * + * Set the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags | flags); +} + +/** + * ntfs_clear_volume_flags - clear bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to clear on the volume + * + * Clear the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. 
+ */ +static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags & ~flags); +} + +#endif /* NTFS_RW */ + /** * ntfs_remount - change the mount options of a mounted ntfs filesystem * @sb: superblock of mounted ntfs filesystem @@ -316,30 +411,72 @@ * For the read-write compiled driver, if we are remounting read-write, * make sure there are no volume errors and that no unsupported volume * flags are set. Also, empty the logfile journal as it would become - * stale as soon as something is written to the volume. + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. */ if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { static const char *es = ". Cannot remount read-write."; + /* Remounting read-write. */ if (NVolErrors(vol)) { ntfs_error(sb, "Volume has errors and is read-only%s", es); return -EROFS; } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { ntfs_error(sb, "Volume has unsupported flags set and " "is read-only%s", es); return -EROFS; } + if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_error(sb, "Failed to set dirty bit in volume " + "information flags%s", es); + return -EROFS; + } +#if 0 + // TODO: Enable this code once we start modifying anything that + // is different between NTFS 1.2 and 3.x... + /* Set NT4 compatibility flag on newer NTFS version volumes.
*/ + if ((vol->major_ver > 1)) { + if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + ntfs_error(sb, "Failed to set NT4 " + "compatibility flag%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } +#endif if (!ntfs_empty_logfile(vol->logfile_ino)) { ntfs_error(sb, "Failed to empty journal $LogFile%s", es); NVolSetErrors(vol); return -EROFS; } + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + } } // TODO: For now we enforce no atime and dir atime updates as they are // not implemented. + if ((sb->s_flags & MS_NOATIME) && !(*flags & MS_NOATIME)) + ntfs_warning(sb, "Atime updates are not implemented yet. " + "Leaving them disabled."); + else if ((sb->s_flags & MS_NODIRATIME) && !(*flags & MS_NODIRATIME)) + ntfs_warning(sb, "Directory atime updates are not implemented " + "yet. Leaving them disabled."); *flags |= MS_NOATIME | MS_NODIRATIME; #endif /* ! NTFS_RW */ @@ -1131,7 +1268,7 @@ le32_to_cpu(ctx->attr->data.resident.value_length) > (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) goto err_put_vol; - /* Setup volume flags and version. */ + /* Copy the volume flags and version to the ntfs_volume structure. */ vol->vol_flags = vi->flags; vol->major_ver = vi->major_ver; vol->minor_ver = vi->minor_ver; @@ -1142,9 +1279,12 @@ #ifdef NTFS_RW /* Make sure that no unsupported volume flags are set. */ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - static const char *es1 = "Volume has unsupported flags set"; + static const char *es1a = "Volume is dirty"; + static const char *es1b = "Volume has unsupported flags set"; static const char *es2 = ". Run chkdsk and mount in Windows."; - + const char *es1; + + es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b; /* If a read-write mount, convert it to a read-only mount. 
*/ if (!(sb->s_flags & MS_RDONLY)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | @@ -1171,10 +1311,12 @@ */ if (!load_and_check_logfile(vol) || !ntfs_is_logfile_clean(vol->logfile_ino)) { - static const char *es1 = "Failed to load $LogFile"; - static const char *es2 = "$LogFile is not clean"; - static const char *es3 = ". Mount in Windows."; + static const char *es1a = "Failed to load $LogFile"; + static const char *es1b = "$LogFile is not clean"; + static const char *es2 = ". Mount in Windows."; + const char *es1; + es1 = !vol->logfile_ino ? es1a : es1b; /* If a read-write mount, convert it to a read-only mount. */ if (!(sb->s_flags & MS_RDONLY)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | @@ -1182,21 +1324,66 @@ ntfs_error(sb, "%s and neither on_errors=" "continue nor on_errors=" "remount-ro was specified%s", - !vol->logfile_ino ? es1 : es2, - es3); + es1, es2); goto iput_logfile_err_out; } sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; - ntfs_error(sb, "%s. Mounting read-only%s", - !vol->logfile_ino ? es1 : es2, es3); + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " - "read-write%s", - !vol->logfile_ino ? es1 : es2, es3); + "read-write%s", es1, es2); /* This will prevent a read-write remount. */ NVolSetErrors(vol); - /* If a read-write mount, empty the logfile. */ - } else if (!(sb->s_flags & MS_RDONLY) && + } + /* If (still) a read-write mount, mark the volume dirty. */ + if (!(sb->s_flags & MS_RDONLY) && + ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + static const char *es1 = "Failed to set dirty bit in volume " + "information flags"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_logfile_err_out; + } + ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + /* + * Do not set NVolErrors() because ntfs_remount() might manage + * to set the dirty flag in which case all would be well. + */ + } +#if 0 + // TODO: Enable this code once we start modifying anything that is + // different between NTFS 1.2 and 3.x... + /* + * If (still) a read-write mount, set the NT4 compatibility flag on + * newer NTFS version volumes. + */ + if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + static const char *es1 = "Failed to set NT4 compatibility flag"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_logfile_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + NVolSetErrors(vol); + } +#endif + /* If (still) a read-write mount, empty the logfile. */ + if (!(sb->s_flags & MS_RDONLY) && !ntfs_empty_logfile(vol->logfile_ino)) { static const char *es1 = "Failed to empty $LogFile"; static const char *es2 = ". Mount in Windows."; @@ -1209,12 +1396,11 @@ es1, es2); goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - /* This will prevent a read-write remount. */ + sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; NVolSetErrors(vol); } -#endif +#endif /* NTFS_RW */ /* * Get the inode for the attribute definitions file and parse the * attribute definitions. 
@@ -1289,18 +1475,69 @@ /** * ntfs_put_super - called by the vfs to unmount a volume - * @vfs_sb: vfs superblock of volume to unmount + * @sb: vfs superblock of volume to unmount * * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when * the volume is being unmounted (umount system call has been invoked) and it * releases all inodes and memory belonging to the NTFS specific part of the * super block. */ -static void ntfs_put_super(struct super_block *vfs_sb) +static void ntfs_put_super(struct super_block *sb) { - ntfs_volume *vol = NTFS_SB(vfs_sb); + ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering."); +#ifdef NTFS_RW + /* + * Commit all inodes while they are still open in case some of them + * cause others to be dirtied. + */ + ntfs_commit_inode(vol->vol_ino); + + /* NTFS 3.0+ specific. */ + if (vol->major_ver >= 3) { + if (vol->secure_ino) + ntfs_commit_inode(vol->secure_ino); + } + + ntfs_commit_inode(vol->root_ino); + + down_write(&vol->lcnbmp_lock); + ntfs_commit_inode(vol->lcnbmp_ino); + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + ntfs_commit_inode(vol->mftbmp_ino); + up_write(&vol->mftbmp_lock); + + if (vol->logfile_ino) + ntfs_commit_inode(vol->logfile_ino); + + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + + /* + * If a read-write mount and no volume errors have occurred, mark the + * volume clean. Also, re-commit all affected inodes. + */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + ntfs_commit_inode(vol->vol_ino); + ntfs_commit_inode(vol->root_ino); + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + } else { + ntfs_warning(sb, "Volume has errors. Leaving volume " + "marked dirty. 
Run chkdsk."); + } + } +#endif /* NTFS_RW */ iput(vol->vol_ino); vol->vol_ino = NULL; @@ -1331,11 +1568,47 @@ iput(vol->logfile_ino); vol->logfile_ino = NULL; } - if (vol->mftmirr_ino) { + /* Re-commit the mft mirror and mft just in case. */ + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); iput(vol->mftmirr_ino); vol->mftmirr_ino = NULL; } + /* + * If any dirty inodes are left, throw away all mft data page cache + * pages to allow a clean umount. This should never happen any more + * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. If it does + * happen anyway, we want to know... + */ + ntfs_commit_inode(vol->mft_ino); + write_inode_now(vol->mft_ino, 1); + if (!list_empty(&sb->s_dirty)) { + const char *s1, *s2; + + down(&vol->mft_ino->i_sem); + truncate_inode_pages(vol->mft_ino->i_mapping, 0); + up(&vol->mft_ino->i_sem); + write_inode_now(vol->mft_ino, 1); + if (!list_empty(&sb->s_dirty)) { + static const char *_s1 = "inodes"; + static const char *_s2 = ""; + s1 = _s1; + s2 = _s2; + } else { + static const char *_s1 = "mft pages"; + static const char *_s2 = "They have been thrown " + "away. "; + s1 = _s1; + s2 = _s2; + } + ntfs_error(sb, "Dirty %s found at umount time. %sYou should " + "run chkdsk. Please email " + "linux-ntfs-dev@lists.sourceforge.net and say " + "that you saw this message. Thank you.", s1, + s2); + } #endif /* NTFS_RW */ iput(vol->mft_ino); @@ -1344,7 +1617,7 @@ vol->upcase_len = 0; /* * Decrease the number of mounts and destroy the global default upcase - * table if necessary. Also decrease the number of upcase users if we + * table if necessary. Also decrease the number of upcase users if we * are a user. 
*/ down(&ntfs_lock); @@ -1368,7 +1641,7 @@ unload_nls(vol->nls_map); vol->nls_map = NULL; } - vfs_sb->s_fs_info = NULL; + sb->s_fs_info = NULL; kfree(vol); return; } @@ -1629,8 +1902,8 @@ #ifdef NTFS_RW //.dirty_inode = NULL, /* VFS: Called from // __mark_inode_dirty(). */ - //.write_inode = NULL, /* VFS: Write dirty inode to - // disk. */ + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to + disk. */ //.drop_inode = NULL, /* VFS: Called just after the // inode reference count has // been decreased to zero. @@ -1719,8 +1992,12 @@ #ifndef NTFS_RW sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; #else - // TODO: For now we enforce no atime and dir atime updates as they are - // not implemented. + if (!(sb->s_flags & MS_NOATIME)) + ntfs_warning(sb, "Atime updates are not implemented yet. " + "Disabling them."); + else if (!(sb->s_flags & MS_NODIRATIME)) + ntfs_warning(sb, "Directory atime updates are not implemented " + "yet. Disabling them."); sb->s_flags |= MS_NOATIME | MS_NODIRATIME; #endif /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */