--- linux-2.2.17pre9.ext3-0.0.2e/fs/buffer.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/buffer.c Thu Jun 29 20:41:30 2000 @@ -234,9 +234,9 @@ bh->b_count++; next->b_count++; bh->b_flushtime = 0; - ll_rw_block(WRITE, 1, &bh); J_ASSERT(!bh->b_transaction); J_ASSERT(bh->b_jlist == 0); + ll_rw_block(WRITE, 1, &bh); bh->b_count--; next->b_count--; retry = 1; @@ -1516,7 +1516,7 @@ if (buffer_locked(p)) { if (wait) __wait_on_buffer(p); - } else if (buffer_dirty(p)) + } else if (buffer_dirty(p) && !p->b_jlist) ll_rw_block(WRITE, 1, &p); } while (tmp != bh); @@ -1790,9 +1790,9 @@ #ifdef DEBUG if(nlist != BUF_DIRTY) ncount++; #endif - ll_rw_block(WRITE, 1, &bh); J_ASSERT(!bh->b_transaction); J_ASSERT(bh->b_jlist == 0); + ll_rw_block(WRITE, 1, &bh); bh->b_count--; next->b_count--; } @@ -1951,6 +1951,8 @@ bh->b_count++; ndirty++; bh->b_flushtime = 0; + J_ASSERT(!bh->b_transaction); + J_ASSERT(bh->b_jlist == 0); if (major == LOOP_MAJOR) { ll_rw_block(wrta_cmd,1, &bh); wrta_cmd = WRITEA; @@ -1959,8 +1961,6 @@ } else ll_rw_block(WRITE, 1, &bh); - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); #ifdef DEBUG if(nlist != BUF_DIRTY) ncount++; #endif --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/balloc.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/balloc.c Thu Jun 29 21:29:39 2000 @@ -214,7 +214,7 @@ */ if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && - sb->u.ext3_sb.s_block_bitmap[block_group]) { + sb->u.ext3_sb.s_block_bitmap[0]) { return 0; } /* @@ -611,6 +611,8 @@ unlock_super (sb); return 0; } + if (!buffer_uptodate(bh)) + wait_on_buffer(bh); /* @@@ This will eventually have to be a data-style operation, not metadata */ --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/dir.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/dir.c Thu Jun 29 17:36:49 2000 @@ -196,15 +196,18 @@ * version stamp to detect whether or * not the directory has been modified * during 
the copy operation. + * AV: It can't be modified, but it fscking + * can be seeked by another process that shares + * the descriptor. */ - unsigned long version = inode->i_version; + unsigned long version = filp->f_version; error = filldir(dirent, de->name, de->name_len, filp->f_pos, le32_to_cpu(de->inode)); if (error) break; - if (version != inode->i_version) + if (version != filp->f_version) goto revalidate; stored ++; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/file.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/file.c Wed Jul 5 14:50:32 2000 @@ -216,8 +216,7 @@ needed = (count >> EXT3_BLOCK_SIZE_BITS(sb)) + 1; if (needed > EXT3_MAX_TRANS_DATA) needed = EXT3_MAX_TRANS_DATA; - handle = journal_start(EXT3_JOURNAL(inode), - EXT3_DATA_TRANS_BLOCKS + needed); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); /* Check for overflow.. */ @@ -297,14 +296,13 @@ if (journal_extend(handle, needed)) { /* Couldn't extend: OK, commit the current * transaction and start a new one. 
*/ - if (pos > inode->i_size) - inode->i_size = pos; + if (pos > inode->u.ext3_i.i_disksize) + inode->u.ext3_i.i_disksize = pos; inode->i_ctime = inode->i_mtime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - journal_stop(handle); - handle = journal_start(EXT3_JOURNAL(inode), - EXT3_DATA_TRANS_BLOCKS - + needed); + ext3_journal_stop(handle, inode); + handle = ext3_journal_start + (inode, EXT3_DATA_TRANS_BLOCKS + needed); } } @@ -417,13 +415,16 @@ if (filp->f_flags & O_SYNC) handle->h_sync = 1; - if (pos > inode->i_size) + if (pos > inode->i_size) { inode->i_size = pos; + inode->u.ext3_i.i_disksize = pos; + } + inode->i_ctime = inode->i_mtime = CURRENT_TIME; *ppos = pos; ext3_mark_inode_dirty(handle, inode); error_out: - journal_stop(handle); + ext3_journal_stop(handle, inode); return written; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/fsync.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/fsync.c Tue Jul 4 15:00:23 2000 @@ -46,10 +46,21 @@ if (!bh) return 0; if (wait && buffer_req(bh) && !buffer_uptodate(bh)) { - brelse (bh); - return -1; + /* There can be a parallel read(2) that started read-I/O + on the buffer so we can't assume that there's been + an I/O error without first waiting for I/O completion. */ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + { + brelse (bh); + return -1; + } } if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) { + if (wait) + /* when we return from fsync all the blocks + must be _just_ stored on disk */ + wait_on_buffer(bh); brelse (bh); return 0; } @@ -262,7 +273,7 @@ struct inode *inode = dentry->d_inode; handle_t *handle; - handle = journal_start(EXT3_JOURNAL(inode), 1); /* @@@ Error? */ + handle = ext3_journal_start(inode, 1); /* @@@ Error? */ handle->h_sync = 1; if (S_ISLNK(inode->i_mode) && !(inode->i_blocks)) @@ -289,6 +300,6 @@ skip: err |= ext3_mark_inode_dirty (handle, inode); - journal_stop(handle); + ext3_journal_stop(handle, inode); return err ? 
-EIO : 0; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ialloc.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ialloc.c Fri Jun 30 11:06:36 2000 @@ -270,21 +270,6 @@ } /* - * This function increments the inode version number - * - * This may be used one day by the NFS server - */ -static void inc_inode_version (struct inode * inode, - struct ext3_group_desc *gdp, - int mode) -{ - inode->u.ext3_i.i_version++; - mark_inode_dirty(inode); - - return; -} - -/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -497,13 +482,15 @@ inode->u.ext3_i.i_file_acl = 0; inode->u.ext3_i.i_dir_acl = 0; inode->u.ext3_i.i_dtime = 0; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); inode->u.ext3_i.i_block_group = i; inode->i_op = NULL; if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) inode->i_flags |= MS_SYNCHRONOUS; insert_inode_hash(inode); + inode->i_generation = inode_generation_count++; + inode->u.ext3_i.i_version = inode->i_generation; ext3_mark_inode_dirty(handle, inode); - inc_inode_version (inode, gdp, mode); unlock_super (sb); if(DQUOT_ALLOC_INODE(sb, inode)) { @@ -516,6 +503,51 @@ ext3_debug ("allocating inode %lu\n", inode->i_ino); *err = 0; + return inode; +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino) +{ + ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + unsigned long bit; + int bitmap_nr; + struct buffer_head *bh; + struct inode *inode = NULL; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan ino %ld! 
e2fsck was run?\n", ino); + return NULL; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || + !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { + ext3_warning(sb, __FUNCTION__, + "inode bitmap error for orphan %ld\n", ino); + return NULL; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || + is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan inode %ld! e2fsck was run?\n", ino); + + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode && inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + return NULL; + } + return inode; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/inode.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/inode.c Wed Jul 5 14:48:31 2000 @@ -42,6 +42,40 @@ } /* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. 
+ */ +void ext3_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev = inode->u.ext3_i.i_orphan.prev; + struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + ino_t ino_next = NEXT_ORPHAN(inode); + + if (list_empty(&inode->u.ext3_i.i_orphan)) + return; + + jfs_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + lock_super(inode->i_sb); + list_del(&inode->u.ext3_i.i_orphan); + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); /* list_del() leaves stale links; reset so list_empty() is true again */ + if (prev == &sbi->s_orphan) { + jfs_debug(4, "superblock will point to %ld\n", ino_next); + journal_get_write_access(handle, sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct inode *i_prev = + list_entry(prev, struct inode, u.ext3_i.i_orphan); + + jfs_debug(4, "orphan inode %ld will point to %ld\n", + i_prev->i_ino, ino_next); + NEXT_ORPHAN(i_prev) = ino_next; + ext3_mark_inode_dirty(handle, i_prev); + } + unlock_super(inode->i_sb); +} + +/* * Called at the last iput() if i_nlink is zero. */ void ext3_delete_inode (struct inode * inode) @@ -51,19 +85,24 @@ if (inode->i_ino == EXT3_ACL_IDX_INO || inode->i_ino == EXT3_ACL_DATA_INO) return; - inode->u.ext3_i.i_dtime = CURRENT_TIME; - handle = journal_start(EXT3_JOURNAL(inode), - EXT3_DELETE_TRANS_BLOCKS); + /* When we delete an inode, we increment its i_version. If it + is ever read in from disk again, it will have a different + i_version. 
*/ + inode->u.ext3_i.i_version++; + + handle = ext3_journal_start(inode, EXT3_DELETE_TRANS_BLOCKS); if (IS_SYNC(inode)) handle->h_sync = 1; - ext3_mark_inode_dirty(handle, inode); inode->i_size = 0; if (inode->i_blocks) ext3_truncate (inode); - ext3_free_inode (handle, inode); - journal_stop(handle); + ext3_orphan_del(handle, inode); + inode->u.ext3_i.i_dtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + ext3_free_inode(handle, inode); + ext3_journal_stop(handle, inode); } #define inode_bmap(inode, nr) ((inode)->u.ext3_i.i_data[(nr)]) @@ -129,6 +168,8 @@ "cannot get block %lu", result); return 0; } + if (!buffer_uptodate(bh)) + wait_on_buffer(bh); /* @@@ Once we start journaling data separately, this needs to become dependent on the type of inode we are allocating inside */ @@ -579,10 +620,13 @@ << 32; #endif } + inode->u.ext3_i.i_disksize = inode->i_size; inode->u.ext3_i.i_version = le32_to_cpu(iloc.raw_inode->i_version); + inode->i_generation = inode->u.ext3_i.i_version; inode->u.ext3_i.i_block_group = iloc.block_group; inode->u.ext3_i.i_next_alloc_block = 0; inode->u.ext3_i.i_next_alloc_goal = 0; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); if (inode->u.ext3_i.i_prealloc_count) ext3_error (inode->i_sb, "ext3_read_inode", "New inode has non-zero prealloc count!"); @@ -656,7 +700,7 @@ raw_inode->i_uid = cpu_to_le16(inode->i_uid); raw_inode->i_gid = cpu_to_le16(inode->i_gid); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); raw_inode->i_atime = cpu_to_le32(inode->i_atime); raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); @@ -674,7 +718,7 @@ raw_inode->i_size_high = cpu_to_le32(inode->u.ext3_i.i_high_size); #else - raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); + raw_inode->i_size_high = cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); #endif } raw_inode->i_version = 
cpu_to_le32(inode->u.ext3_i.i_version); @@ -813,14 +857,14 @@ struct buffer_head *bh = sb->u.ext3_sb.s_sbh; /* @@@ Error, null handle? */ - handle = journal_start(EXT3_JOURNAL(inode), 1); + handle = ext3_journal_start(inode, 1); /* If this is the first large file * created, add a flag to the superblock */ es->s_feature_ro_compat |= cpu_to_le32(EXT3_FEATURE_RO_COMPAT_LARGE_FILE); journal_dirty_metadata(handle, bh); /*@@@err*/ - journal_stop(handle); + ext3_journal_stop(handle, inode); } } #endif @@ -835,7 +879,7 @@ * required is one. */ /* @@@ Error, null handle? */ - handle = journal_start(EXT3_JOURNAL(inode), 1); + handle = ext3_journal_start(inode, 1); retval = ext3_reserve_inode_write(handle, inode, &iloc); if (retval) goto out_stop; @@ -877,7 +921,7 @@ retval = ext3_mark_iloc_dirty(handle, inode, &iloc); out_stop: - journal_stop(handle); + ext3_journal_stop(handle, inode); out: return retval; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ioctl.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/ioctl.c Thu Jun 29 21:36:18 2000 @@ -77,6 +77,7 @@ if (get_user(inode->u.ext3_i.i_version, (int *) arg)) return -EFAULT; inode->i_ctime = CURRENT_TIME; + inode->i_generation = inode->u.ext3_i.i_version; mark_inode_dirty(inode); return 0; default: --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/namei.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/namei.c Tue Jul 4 15:35:08 2000 @@ -373,8 +373,7 @@ handle_t *handle; int err = -EIO; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DATA_TRANS_BLOCKS + 3); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); /* * N.B. Several error exits in ext3_new_inode don't set err. 
@@ -406,7 +405,7 @@ d_instantiate(dentry, inode); out: - journal_stop(handle); + ext3_journal_stop(handle, dir); /* dir, not inode: inode is presumably NULL if ext3_new_inode() failed */ return err; } @@ -418,8 +417,7 @@ int err = -EIO; handle_t *handle = 0; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DATA_TRANS_BLOCKS + 3); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); inode = ext3_new_inode (handle, dir, mode, &err); if (!inode) @@ -438,6 +436,10 @@ if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) de->file_type = EXT3_FT_REG_FILE; + } else if (S_ISSOCK(inode->i_mode)) { + if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb, + EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = EXT3_FT_SOCK; } else if (S_ISCHR(inode->i_mode)) { inode->i_op = &chrdev_inode_operations; if (EXT3_HAS_INCOMPAT_FEATURE(dir->i_sb, @@ -463,7 +465,7 @@ d_instantiate(dentry, inode); brelse(bh); out_stop: - journal_stop(handle); + ext3_journal_stop(handle, dir); /* dir, not inode: inode is presumably NULL if ext3_new_inode() failed */ return err; out_no_entry: @@ -485,8 +487,7 @@ if (dir->i_nlink >= EXT3_LINK_MAX) goto out; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DATA_TRANS_BLOCKS + 4); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 4); err = -EIO; inode = ext3_new_inode (handle, dir, S_IFDIR, &err); if (!inode) @@ -545,7 +546,7 @@ err = 0; out_stop: - journal_stop(handle); + ext3_journal_stop(handle, dir); /* dir, not inode: inode is presumably NULL if ext3_new_inode() failed */ out: return err; @@ -628,8 +629,7 @@ struct ext3_dir_entry_2 * de; handle_t *handle; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DELETE_TRANS_BLOCKS); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); retval = -ENOENT; bh = ext3_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &de); @@ -669,11 +669,41 @@ handle->h_sync = 1; end_rmdir: - journal_stop(handle); + ext3_journal_stop(handle, dir); brelse (bh); return retval; } +/* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * 
transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ +void ext3_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + lock_super(sb); + if (!list_empty(&inode->u.ext3_i.i_orphan)) { + unlock_super(sb); + return; + } + journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); + ext3_mark_inode_dirty(handle, inode); + unlock_super(sb); + + jfs_debug(4, "superblock will point to %ld\n", inode->i_ino); + jfs_debug(4, "orphan inode %ld will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +} + int ext3_unlink(struct inode * dir, struct dentry *dentry) { int retval; @@ -682,8 +712,7 @@ struct ext3_dir_entry_2 * de; handle_t *handle; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DELETE_TRANS_BLOCKS); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); retval = -ENOENT; bh = ext3_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &de); @@ -713,13 +742,15 @@ dir->u.ext3_i.i_flags &= ~EXT3_BTREE_FL; ext3_mark_inode_dirty(handle, dir); inode->i_nlink--; + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); ext3_mark_inode_dirty(handle, inode); inode->i_ctime = dir->i_ctime; retval = 0; d_delete(dentry); /* This also frees the inode */ end_unlink: - journal_stop(handle); + ext3_journal_stop(handle, dir); brelse (bh); return retval; } @@ -734,8 +765,7 @@ char c; handle_t *handle; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DATA_TRANS_BLOCKS + 5); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); if (!(inode = ext3_new_inode (handle, dir, S_IFLNK, &err))) goto out; @@ -788,7 +818,7 @@ 
d_instantiate(dentry, inode); err = 0; out: - journal_stop(handle); + ext3_journal_stop(handle, dir); return err; out_no_entry: @@ -813,8 +843,7 @@ if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; - handle = journal_start(EXT3_JOURNAL(dir), - EXT3_DATA_TRANS_BLOCKS); + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); bh = ext3_add_entry (handle, dir, dentry->d_name.name, dentry->d_name.len, &de, &err); if (!bh) @@ -829,6 +858,8 @@ de->file_type = EXT3_FT_DIR; else if (S_ISLNK(inode->i_mode)) de->file_type = EXT3_FT_SYMLINK; + else if (S_ISSOCK(inode->i_mode)) + de->file_type = EXT3_FT_SOCK; else if (S_ISCHR(inode->i_mode)) de->file_type = EXT3_FT_CHRDEV; else if (S_ISBLK(inode->i_mode)) @@ -847,7 +878,7 @@ d_instantiate(dentry, inode); err = 0; out: - journal_stop(handle); + ext3_journal_stop(handle, dir); return err; } @@ -870,8 +901,7 @@ old_bh = new_bh = dir_bh = NULL; - handle = journal_start(EXT3_JOURNAL(old_dir), - 2 * EXT3_DATA_TRANS_BLOCKS + 2); + handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); old_bh = ext3_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len, &old_de); /* @@ -972,6 +1002,6 @@ brelse (dir_bh); brelse (old_bh); brelse (new_bh); - journal_stop(handle); + ext3_journal_stop(handle, old_dir); return retval; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/super.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/super.c Tue Jul 4 23:47:07 2000 @@ -44,6 +44,8 @@ static void ext3_commit_super (struct super_block * sb, struct ext3_super_block * es, int sync); +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); void ext3_error (struct super_block * sb, const char * function, const char * fmt, ...) 
@@ -137,6 +139,8 @@ brelse (EXT3_SB(sb)->s_block_bitmap[i]); brelse (EXT3_SB(sb)->s_sbh); + J_ASSERT (list_empty(&EXT3_SB(sb)->s_orphan)); + MOD_DEC_USE_COUNT; return; } @@ -197,7 +201,7 @@ else if (!strcmp (this_char, "errors")) { if (!value || !*value) { printk ("EXT3-fs: the errors option requires " - "an argument"); + "an argument\n"); return 0; } if (!strcmp (value, "continue")) { @@ -414,6 +418,73 @@ return 1; } +/* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. 
+ */ +static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; + if (!es->s_last_orphan) { + jfs_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on read-only fs\n", + kdevname(sb->s_dev)); + sb->s_flags &= ~MS_RDONLY; + } + + while (es->s_last_orphan) { + struct inode *inode; + + if (!(inode = + ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + es->s_last_orphan = 0; + break; + } + + list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); + if (inode->i_nlink) { + jfs_debug(2, "truncating inode %ld to %ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + jfs_debug(2, "deleting unreferenced inode %ld\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", + kdevname(sb->s_dev), PLURAL(nr_orphans)); + if (nr_truncates) + printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", + kdevname(sb->s_dev), PLURAL(nr_truncates)); + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + #define log2(n) ffz(~(n)) struct super_block * ext3_read_super (struct super_block * sb, void * data, @@ -670,6 +741,7 @@ */ sb->s_dev = dev; sb->s_op = &ext3_sops; + INIT_LIST_HEAD(&sb->u.ext3_sb.s_orphan); /* unlinked but open files */ unlock_super (sb); err = 0; @@ -699,7 +771,10 @@ sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO), NULL); if (!sb->s_root) goto error_out; - ext3_setup_super (sb, es); + ext3_setup_super(sb, es); + ext3_orphan_cleanup(sb, es); + ext3_mark_recovery_complete(sb, es); + printk (KERN_INFO "EXT3-fs: recovery complete.\n"); return sb; error_out: @@ -755,7 +830,7 @@ * can get read-write access to the device. 
*/ - if (es->s_feature_incompat & EXT3_FEATURE_INCOMPAT_RECOVER) { + if (es->s_feature_incompat & cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { printk(KERN_ERR "EXT3-fs: WARNING: recovery required on readonly filesystem.\n"); if (is_read_only(sb->s_dev)) { @@ -785,21 +860,6 @@ } EXT3_SB(sb)->s_journal = journal; - - /* - * Have we just finished recovery? If so, and if we are - * mounting the filesystem readonly, then we will end up with a - * consistent fs on disk. Record that fact if so. - */ - - if (le32_to_cpu(es->s_feature_incompat) & EXT3_FEATURE_INCOMPAT_RECOVER) { - printk (KERN_INFO "EXT3-fs: recovery complete.\n"); - if (sb->s_flags & MS_RDONLY) { - es->s_feature_incompat &= ~(cpu_to_le32(EXT3_FEATURE_INCOMPAT_RECOVER)); - ext3_commit_super(sb, es, 1); - } - } - return 0; } @@ -848,13 +908,33 @@ { es->s_wtime = cpu_to_le32(CURRENT_TIME); mark_buffer_dirty(sb->u.ext3_sb.s_sbh, 1); - sb->s_dirt = 0; if (sync) { ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); wait_on_buffer(sb->u.ext3_sb.s_sbh); } } + +/* + * Have we just finished recovery? If so, and if we are mounting the + * filesystem readonly, then we will end up with a consistent fs on + * disk. Record that fact if so. 
+ */ +static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) +{ + journal_flush(EXT3_SB(sb)->s_journal); + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + EXT3_SB(sb)->s_feature_incompat &= ~EXT3_FEATURE_INCOMPAT_RECOVER; + es->s_feature_incompat = cpu_to_le32(EXT3_SB(sb)->s_feature_incompat); + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_commit_super(sb, es, 1); + sb->s_dirt = 0; + } + } +} + /* * In the second extended file system, it is not necessary to * write the super block since we use a mapping of the @@ -868,20 +948,22 @@ void ext3_write_super (struct super_block * sb) { - struct ext3_super_block * es; + tid_t wait_tid; + sb->s_dirt = 0; + if (!(sb->s_flags & MS_RDONLY)) { journal_t *journal; journal = EXT3_SB(sb)->s_journal; - es = sb->u.ext3_sb.s_es; - if (journal->j_running_transaction) + if (journal->j_running_transaction) { + wait_tid = journal->j_running_transaction->t_tid; log_start_commit(journal, journal->j_running_transaction); - if (journal->j_committing_transaction) + log_wait_commit(journal, wait_tid); + } else if (journal->j_committing_transaction) log_wait_commit(journal, journal->j_committing_transaction->t_tid); } - sb->s_dirt = 0; } int ext3_remount (struct super_block * sb, int * flags, char * data) @@ -912,7 +994,6 @@ * to disable replay of the journal when we next remount */ sb->s_flags |= MS_RDONLY; - journal_flush(EXT3_SB(sb)->s_journal); /* * OK, test if we are remounting a valid rw partition @@ -923,11 +1004,7 @@ (sb->u.ext3_sb.s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sb->u.ext3_sb.s_mount_state); - es->s_feature_incompat &= cpu_to_le32(~EXT3_FEATURE_INCOMPAT_RECOVER); - es->s_mtime = cpu_to_le32(CURRENT_TIME); - mark_buffer_dirty(sb->u.ext3_sb.s_sbh, 1); - sb->s_dirt = 1; - ext3_commit_super (sb, es, 1); + ext3_mark_recovery_complete(sb, es); } else { /* @@ -938,6 +1015,7 @@ sb->u.ext3_sb.s_mount_state = 
le16_to_cpu(es->s_state); sb->s_flags &= ~MS_RDONLY; ext3_setup_super (sb, es); + sb->s_dirt = 1; } return 0; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/truncate.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/ext3/truncate.c Wed Jul 5 14:53:52 2000 @@ -136,8 +136,7 @@ if (needed > EXT3_MAX_TRANS_DATA) needed = EXT3_MAX_TRANS_DATA; - return journal_start(EXT3_JOURNAL(inode), - EXT3_DATA_TRANS_BLOCKS + needed); + return ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); } static int extend_transaction(handle_t *handle, struct inode *inode) @@ -487,6 +486,19 @@ handle = start_transaction(inode); + /* Add inode to orphan list, so that if this truncate spans multiple + * transactions, and we crash and don't recover the last transaction + * we will resume the truncate when the filesystem recovers. + */ + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + + /* The orphan list will now protect us from a crash before the + * truncate completes, so it is finally safe to propagate the + * new inode size (held for now in i_size) into the on-disk + * inode. 
*/ + inode->u.ext3_i.i_disksize = inode->i_size; + while (1) { retry = trunc_direct(handle, inode); retry |= trunc_indirect (handle, inode, @@ -500,7 +512,7 @@ retry |= trunc_tindirect (handle, inode); if (!retry) break; - journal_stop(handle); + ext3_journal_stop(handle, inode); current->counter = 0; run_task_queue(&tq_disk); current->policy |= SCHED_YIELD; @@ -528,6 +540,10 @@ } } inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (inode->i_nlink) { + ext3_orphan_del(handle, inode); + NEXT_ORPHAN(inode) = 0; + } ext3_mark_inode_dirty(handle, inode); - journal_stop(handle); + ext3_journal_stop(handle, inode); } --- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/commit.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/commit.c Thu Jun 29 23:27:37 2000 @@ -38,7 +38,7 @@ int blocknr; char *tagp = NULL; journal_header_t *header; - journal_block_tag_t *tag = NULL, *last_tag; + journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; --- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/journal.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/journal.c Tue Jul 4 12:30:32 2000 @@ -131,6 +131,11 @@ journal->j_commit_request = transaction->t_tid; } + if (journal->j_commit_timer_active) { + journal->j_commit_timer_active = 0; + del_timer(journal->j_commit_timer); + } + journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jfs_debug(1, "Journal thread exiting.\n"); @@ -318,7 +323,7 @@ journal_file_buffer(bh_in, transaction, BJ_Shadow); journal_file_buffer(new_bh, transaction, BJ_IO); - return do_escape + (done_copy_out << 1); + return do_escape | (done_copy_out << 1); } @@ -784,6 +789,12 @@ while (!err && journal->j_checkpoint_transactions != NULL) err = log_do_checkpoint(journal, journal->j_maxlen); unlock_journal(journal); + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + 
J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); return err; } --- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/recovery.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/recovery.c Tue Jul 4 17:29:54 2000 @@ -223,7 +223,7 @@ */ if (!sb->s_start) { - jfs_debug(1, "No recovery required, last transaction %ld\n", + jfs_debug(1, "No recovery required, last transaction %d\n", ntohl(sb->s_sequence)); journal->j_transaction_sequence = ++next_commit_ID; return 0; @@ -327,8 +327,10 @@ /* If it is the commit block, then we are all done! */ - if (tmp->h_blocktype == htonl(JFS_COMMIT_BLOCK)) + if (tmp->h_blocktype == htonl(JFS_COMMIT_BLOCK)) { + brelse(bh); break; + } /* A descriptor block: we can now write all of * the data blocks. Yay, useful work is finally @@ -376,7 +378,7 @@ } mark_buffer_dirty(nbh, 1); - ll_rw_block(WRITE, 1, &nbh); + // ll_rw_block(WRITE, 1, &nbh); brelse(obh); brelse(nbh); } @@ -389,10 +391,11 @@ break; } /* end of tag loop */ + + brelse(bh); } /* end of descriptor block loop */ - brelse(bh); - + /* We have now replayed that entire transaction: start * looking for the next transaction. */ next_commit_ID++; --- linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/transaction.c.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/fs/jfs/transaction.c Thu Jun 29 23:29:49 2000 @@ -507,7 +507,7 @@ * To deal with that, journal_get_undo_access requests write access to a * buffer for parts of non-rewindable operations such as delete * operations on the bitmaps. The journaling code must keep a copy of -` * the buffer's contents prior to the undo_access call until such time + * the buffer's contents prior to the undo_access call until such time * as we know that the buffer has definitely been committed to disk. 
* * We never need to know which transaction the committed data is part --- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs.h.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs.h Tue Jul 4 16:43:28 2000 @@ -36,8 +36,8 @@ /* * The second extended file system version */ -#define EXT3FS_DATE "2000/3/22" -#define EXT3FS_VERSION "0.0.2d" +#define EXT3FS_DATE "2000/07/04" +#define EXT3FS_VERSION "0.0.2e" /* * Debug code @@ -406,8 +406,10 @@ */ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ - __u32 s_reserved[199]; /* Padding to the end of the block */ + __u32 s_reserved[197]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -419,6 +421,8 @@ #define EXT3_SB(sb) (sb) #endif +#define NEXT_ORPHAN(inode) inode->u.ext3_i.i_dtime + /* * Codes for operating systems */ @@ -586,6 +590,7 @@ /* ialloc.c */ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int, int *); extern void ext3_free_inode (handle_t *, struct inode *); +extern struct inode * ext3_orphan_get (struct super_block * sb, ino_t ino); extern unsigned long ext3_count_free_inodes (struct super_block *); extern void ext3_check_inodes_bitmap (struct super_block *); @@ -601,6 +606,7 @@ extern void ext3_read_inode (struct inode *); extern void ext3_write_inode (struct inode *); extern void ext3_put_inode (struct inode *); +extern void ext3_orphan_del (handle_t *handle, struct inode *); extern void ext3_delete_inode (struct inode *); extern int ext3_sync_inode (handle_t *, struct inode *); extern int ext3_notify_change(struct dentry *, struct iattr *); @@ -616,6 +622,7 @@ extern int ext3_create (struct inode *,struct dentry *,int); extern int ext3_mkdir (struct inode *,struct dentry *,int); extern int ext3_rmdir (struct inode *,struct dentry *); +extern void 
ext3_orphan_add(handle_t *, struct inode *); extern int ext3_unlink (struct inode *,struct dentry *); extern int ext3_symlink (struct inode *,struct dentry *,const char *); extern int ext3_link (struct dentry *, struct inode *, struct dentry *); --- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_i.h.~1~ Thu Jun 29 17:24:22 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_i.h Wed Jul 5 14:45:43 2000 @@ -36,7 +36,13 @@ __u32 i_prealloc_block; __u32 i_prealloc_count; __u32 i_high_size; + struct list_head i_orphan; /* unlinked but open inodes */ int i_new_inode:1; /* Is a freshly allocated inode */ + /* i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to 0 by the VFS + * but the filesystem won't set i_disksize to 0 until the + * truncate is actually under way. */ + off_t i_disksize; }; #endif /* _LINUX_EXT3_FS_I */ --- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_sb.h.~1~ Thu Jun 29 17:41:18 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_fs_sb.h Tue Jul 4 17:01:54 2000 @@ -65,6 +65,7 @@ /* Journaling */ struct inode * s_journal_inode; struct journal_s * s_journal; + struct list_head s_orphan; }; #endif /* _LINUX_EXT3_FS_SB */ --- linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_jfs.h.~1~ Fri Jun 30 19:53:35 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/ext3_jfs.h Wed Jul 5 14:55:34 2000 @@ -113,4 +113,26 @@ return err; } + +/* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. 
+ */ + +static inline handle_t *ext3_journal_start (struct inode *inode, int nblocks) +{ + return journal_start(EXT3_JOURNAL(inode), nblocks); +} + +static inline int ext3_journal_stop (handle_t *handle, struct inode *inode) +{ + int rc = journal_stop(handle); + inode->i_sb->s_dirt = 1; + return rc; +} + #endif /* _LINUX_EXT3_JFS_H */ --- linux-2.2.17pre9.ext3-0.0.2e/include/linux/fs.h.~1~ Thu Jun 29 17:41:20 2000 +++ linux-2.2.17pre9.ext3-0.0.2e/include/linux/fs.h Wed Jul 5 14:54:03 2000 @@ -618,9 +618,6 @@ short int s_ibasket_max; struct list_head s_dirty; /* dirty inodes */ - /* Pointer to journaling control structure for this filesystem */ - journal_t * s_journal; - /* Filesystem-specific data: */ union { struct minix_sb_info minix_sb;