Index: fs/buffer.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/buffer.c,v retrieving revision 1.8.2.16.2.11 retrieving revision 1.36.2.22 diff -u -r1.8.2.16.2.11 -r1.36.2.22 --- fs/buffer.c 9 May 2002 15:54:51 -0000 1.8.2.16.2.11 +++ fs/buffer.c 9 May 2002 16:10:09 -0000 1.36.2.22 @@ -1746,9 +1746,14 @@ } /* Stage 3: start the IO */ - for (i = 0; i < nr; i++) - submit_bh(READ, arr[i]); - + for (i = 0; i < nr; i++) { + struct buffer_head * bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_io_async(bh, 1); + else + submit_bh(READ, bh); + } + return 0; } Index: fs/ext3/file.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/ext3/file.c,v retrieving revision 1.11.2.3 retrieving revision 1.27.2.3 diff -u -r1.11.2.3 -r1.27.2.3 --- fs/ext3/file.c 16 Nov 2001 14:35:14 -0000 1.11.2.3 +++ fs/ext3/file.c 14 May 2002 15:14:27 -0000 1.27.2.3 @@ -61,19 +61,53 @@ static ssize_t ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { + int ret, err; struct inode *inode = file->f_dentry->d_inode; + extern kdev_t TRACE_DEV; - /* - * Nasty: if the file is subject to synchronous writes then we need - * to force generic_osync_inode() to call ext3_write_inode(). - * We do that by marking the inode dirty. This adds much more - * computational expense than we need, but we're going to sync - * anyway. - */ - if (IS_SYNC(inode) || (file->f_flags & O_SYNC)) - mark_inode_dirty(inode); + ret = generic_file_write(file, buf, count, ppos); - return generic_file_write(file, buf, count, ppos); + /* Skip file flushing code if there was an error, or if nothing + was written. */ + if (ret <= 0) + return ret; + + /* If the inode is IS_SYNC, or is O_SYNC and we are doing + data-journaling, then we need to make sure that we force the + transaction to disk to keep all metadata uptodate + synchronously. */ + + if (file->f_flags & O_SYNC) { + /* If we are non-data-journaled, then the dirty data has + already been flushed to backing store by + generic_osync_inode, and the inode has been flushed + too if there have been any modifications other than + mere timestamp updates. + + Open question --- do we care about flushing + timestamps too if the inode is IS_SYNC? */ + if (!ext3_should_journal_data(inode)) + return ret; + + goto force_commit; + } + + /* So we know that there has been no forced data flush. If the + inode is marked IS_SYNC, we need to force one ourselves. */ + if (!IS_SYNC(inode)) + return ret; + + /* Open question #2 --- should we force data to disk here too? + If we don't, the only impact is that data=writeback + filesystems won't flush data to disk automatically on + IS_SYNC, only metadata (but historically, that is what ext2 + has done.) */ + +force_commit: + err = ext3_force_commit(inode->i_sb); + if (err) + return err; + return ret; } struct file_operations ext3_file_operations = { Index: fs/ext3/fsync.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/ext3/fsync.c,v retrieving revision 1.2.2.3 retrieving revision 1.7.2.2 diff -u -r1.2.2.3 -r1.7.2.2 --- fs/ext3/fsync.c 21 Nov 2001 07:49:08 -0000 1.2.2.3 +++ fs/ext3/fsync.c 10 Apr 2002 17:00:50 -0000 1.7.2.2 @@ -62,7 +62,12 @@ * we'll end up waiting on them in commit. */ ret = fsync_inode_buffers(inode); - ret |= fsync_inode_data_buffers(inode); + + /* In writeback node, we need to force out data buffers too. 
In + * the other modes, ext3_force_commit takes care of forcing out + * just the right data blocks. */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) + ret |= fsync_inode_data_buffers(inode); ext3_force_commit(inode->i_sb); Index: fs/ext3/ialloc.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/ext3/ialloc.c,v retrieving revision 1.5.2.4 retrieving revision 1.19.4.5 diff -u -r1.5.2.4 -r1.19.4.5 --- fs/ext3/ialloc.c 13 Mar 2002 11:53:43 -0000 1.5.2.4 +++ fs/ext3/ialloc.c 10 Apr 2002 17:02:19 -0000 1.19.4.5 @@ -392,7 +392,7 @@ err = -ENOSPC; if (!gdp) - goto fail; + goto out; err = -EIO; bitmap_nr = load_inode_bitmap (sb, i); @@ -523,9 +523,10 @@ return inode; fail: + ext3_std_error(sb, err); +out: unlock_super(sb); iput(inode); - ext3_std_error(sb, err); return ERR_PTR(err); } Index: fs/ext3/inode.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/ext3/inode.c,v retrieving revision 1.12.2.6 retrieving revision 1.64.2.22 diff -u -r1.12.2.6 -r1.64.2.22 --- fs/ext3/inode.c 9 May 2002 15:54:51 -0000 1.12.2.6 +++ fs/ext3/inode.c 13 May 2002 17:10:02 -0000 1.64.2.22 @@ -412,6 +412,7 @@ return NULL; changed: + brelse(bh); *err = -EAGAIN; goto no_block; failure: @@ -948,11 +949,13 @@ } static int walk_page_buffers( handle_t *handle, + struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)( handle_t *handle, + struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; @@ -970,7 +973,7 @@ *partial = 1; continue; } - err = (*fn)(handle, bh); + err = (*fn)(handle, inode, bh); if (!ret) ret = err; } @@ -1003,7 +1006,7 @@ * write. */ -static int do_journal_get_write_access(handle_t *handle, +static int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return ext3_journal_get_write_access(handle, bh); @@ -1029,7 +1032,7 @@ goto prepare_write_failed; if (ext3_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, NULL, do_journal_get_write_access); if (ret) { /* @@ -1050,24 +1053,32 @@ return ret; } -static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) +static int journal_dirty_sync_data(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { - return ext3_journal_dirty_data(handle, bh, 0); + int ret = ext3_journal_dirty_data(handle, bh, 0); + if (bh->b_inode != inode) + buffer_insert_inode_data_queue(bh, inode); + return ret; } /* * For ext3_writepage(). We also brelse() the buffer to account for * the bget() which ext3_writepage() performs. 
*/ -static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) +static int journal_dirty_async_data(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int ret = ext3_journal_dirty_data(handle, bh, 1); + if (bh->b_inode != inode) + buffer_insert_inode_data_queue(bh, inode); __brelse(bh); return ret; } /* For commit_write() in data=journal mode */ -static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +static int commit_write_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { set_bit(BH_Uptodate, &bh->b_state); return ext3_journal_dirty_metadata(handle, bh); @@ -1102,7 +1113,7 @@ int partial = 0; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, &partial, commit_write_fn); if (!partial) SetPageUptodate(page); @@ -1112,7 +1123,7 @@ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; } else { if (ext3_should_order_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, NULL, journal_dirty_sync_data); } /* Be careful here if generic_commit_write becomes a @@ -1194,7 +1205,8 @@ return generic_block_bmap(mapping,block,ext3_get_block); } -static int bget_one(handle_t *handle, struct buffer_head *bh) +static int bget_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { atomic_inc(&bh->b_count); return 0; @@ -1293,7 +1305,7 @@ create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); page_buffers = page->buffers; - walk_page_buffers(handle, page_buffers, 0, + walk_page_buffers(handle, inode, page_buffers, 0, PAGE_CACHE_SIZE, NULL, bget_one); } @@ -1311,7 +1323,7 @@ /* And attach them to the current transaction */ if (order_data) { - err = walk_page_buffers(handle, page_buffers, + err = walk_page_buffers(handle, inode, page_buffers, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); if (!ret) ret = err; Index: fs/ext3/super.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/ext3/super.c,v retrieving revision 1.12.2.5 retrieving revision 1.34.2.21 diff -u -r1.12.2.5 -r1.34.2.21 --- fs/ext3/super.c 13 Mar 2002 11:53:43 -0000 1.12.2.5 +++ fs/ext3/super.c 15 Apr 2002 20:34:54 -0000 1.34.2.21 @@ -1589,8 +1589,10 @@ journal_t *journal = EXT3_SB(sb)->s_journal; /* Now we set up the journal barrier. */ + unlock_super(sb); journal_lock_updates(journal); journal_flush(journal); + lock_super(sb); /* Journal blocked and flushed, clear needs_recovery flag. */ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); Index: fs/jbd/journal.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/jbd/journal.c,v retrieving revision 1.11.2.5 retrieving revision 1.49.2.11 diff -u -r1.11.2.5 -r1.49.2.11 --- fs/jbd/journal.c 9 Apr 2002 17:30:41 -0000 1.11.2.5 +++ fs/jbd/journal.c 9 May 2002 15:05:59 -0000 1.49.2.11 @@ -1488,6 +1488,49 @@ unlock_journal(journal); } + +/* + * Report any unexpected dirty buffers which turn up. Normally those + * indicate an error, but they can occur if the user is running (say) + * tune2fs to modify the live filesystem, so we need the option of + * continuing as gracefully as possible. # + * + * The caller should already hold the journal lock and + * journal_datalist_lock spinlock: most callers will need those anyway + * in order to probe the buffer's journaling state safely. 
+ */ +void __jbd_unexpected_dirty_buffer(char *function, int line, + struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + int jlist; + + if (buffer_dirty(bh)) { + printk ("%sUnexpected dirty buffer encountered at " + "%s:%d (%s blocknr %lu)\n", + KERN_WARNING, function, line, + kdevname(bh->b_dev), bh->b_blocknr); +#ifdef JBD_PARANOID_WRITES + J_ASSERT (!buffer_dirty(bh)); +#endif + + /* If this buffer is one which might reasonably be dirty + * --- ie. data, or not part of this journal --- then + * we're OK to leave it alone, but otherwise we need to + * move the dirty bit to the journal's own internal + * JBDDirty bit. */ + jlist = jh->b_jlist; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (atomic_set_buffer_clean(jh2bh(jh))) { + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); + } + } + } +} + + int journal_blocks_per_page(struct inode *inode) { return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); Index: fs/jbd/transaction.c =================================================================== RCS file: /cvsroot/gkernel/ext3/fs/jbd/transaction.c,v retrieving revision 1.14.2.4 retrieving revision 1.64.2.9 diff -u -r1.14.2.4 -r1.64.2.9 --- fs/jbd/transaction.c 23 Jan 2002 07:26:47 -0000 1.14.2.4 +++ fs/jbd/transaction.c 9 May 2002 15:16:34 -0000 1.64.2.9 @@ -539,76 +539,67 @@ static int do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) { + struct buffer_head *bh; transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int error; char *frozen_buffer = NULL; int need_copy = 0; - + int locked; + jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: + bh = jh2bh(jh); + /* @@@ Need to check for errors here at some point. */ /* - * AKPM: neither bdflush nor kupdate run with the BKL. There's - * nothing we can do to prevent them from starting writeout of a - * BUF_DIRTY buffer at any time. And checkpointing buffers are on - * BUF_DIRTY. So. We no longer assert that the buffer is unlocked. - * - * However. It is very wrong for us to allow ext3 to start directly - * altering the ->b_data of buffers which may at that very time be - * undergoing writeout to the client filesystem. This can leave - * the filesystem in an inconsistent, transient state if we crash. - * So what we do is to steal the buffer if it is in checkpoint - * mode and dirty. The journal lock will keep out checkpoint-mode - * state transitions within journal_remove_checkpoint() and the buffer - * is locked to keep bdflush/kupdate/whoever away from it as well. - * * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a * simple lock_journal(). This code here will care for locked buffers. */ - /* - * The buffer_locked() || buffer_dirty() tests here are simply an - * optimisation tweak. If anyone else in the system decides to - * lock this buffer later on, we'll blow up. There doesn't seem - * to be a good reason why they should do this. - */ - if (jh->b_cp_transaction && - (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) { + locked = test_and_set_bit(BH_Lock, &bh->b_state); + if (locked) { + /* We can't reliably test the buffer state if we found + * it already locked, so just wait for the lock and + * retry. 
*/ unlock_journal(journal); - lock_buffer(jh2bh(jh)); - spin_lock(&journal_datalist_lock); - if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) { - /* OK, we need to steal it */ - JBUFFER_TRACE(jh, "stealing from checkpoint mode"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); - - J_ASSERT(handle->h_buffer_credits > 0); - handle->h_buffer_credits--; - - /* This will clear BH_Dirty and set BH_JBDDirty. */ - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - __journal_file_buffer(jh, transaction, BJ_Reserved); - - /* And pull it off BUF_DIRTY, onto BUF_CLEAN */ - refile_buffer(jh2bh(jh)); + __wait_on_buffer(bh); + lock_journal(journal); + goto repeat; + } + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ - /* - * The buffer is now hidden from bdflush. It is - * metadata against the current transaction. - */ - JBUFFER_TRACE(jh, "steal from cp mode is complete"); + if (buffer_dirty(bh)) { + spin_lock(&journal_datalist_lock); + /* First question: is this buffer already part of the + * current transaction or the existing committing + * transaction? */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + JBUFFER_TRACE(jh, "Unexpected dirty buffer"); + jbd_unexpected_dirty_buffer(jh); } spin_unlock(&journal_datalist_lock); - unlock_buffer(jh2bh(jh)); - lock_journal(journal); - goto repeat; } - J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh))); + unlock_buffer(bh); error = -EROFS; if (is_handle_aborted(handle)) @@ -1926,6 +1917,7 @@ transaction_t *transaction, int jlist) { struct journal_head **list = 0; + int was_dirty = 0; assert_spin_locked(&journal_datalist_lock); @@ -1936,13 +1928,24 @@ J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_transaction == 0); - if (jh->b_transaction) { - if (jh->b_jlist == jlist) - return; + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + /* The following list of buffer states needs to be consistent + * with __jbd_unexpected_dirty_buffer()'s handling of dirty + * state. 
*/ + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (atomic_set_buffer_clean(jh2bh(jh)) || + test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) + was_dirty = 1; + } + + if (jh->b_transaction) __journal_unfile_buffer(jh); - } else { + else jh->b_transaction = transaction; - } switch (jlist) { case BJ_None: @@ -1979,12 +1982,8 @@ __blist_add_buffer(list, jh); jh->b_jlist = jlist; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - if (atomic_set_buffer_clean(jh2bh(jh))) { - set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); - } - } + if (was_dirty) + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); } void journal_file_buffer(struct journal_head *jh, Index: include/linux/ext3_fs.h =================================================================== RCS file: /cvsroot/gkernel/ext3/include/linux/ext3_fs.h,v retrieving revision 1.22.2.3 retrieving revision 1.20.2.17 diff -u -r1.22.2.3 -r1.20.2.17 --- include/linux/ext3_fs.h 23 Jan 2002 07:26:47 -0000 1.22.2.3 +++ include/linux/ext3_fs.h 14 May 2002 15:24:16 -0000 1.20.2.17 @@ -36,8 +36,8 @@ /* * The second extended file system version */ -#define EXT3FS_DATE "10 Jan 2002" -#define EXT3FS_VERSION "2.4-0.9.17" +#define EXT3FS_DATE "14 May 2002" +#define EXT3FS_VERSION "2.4-0.9.18" /* * Debug code Index: include/linux/jbd.h =================================================================== RCS file: /cvsroot/gkernel/ext3/include/linux/jbd.h,v retrieving revision 1.38.2.5 diff -u -r1.38.2.5 jbd.h --- include/linux/jbd.h 13 Mar 2002 11:53:44 -0000 1.38.2.5 +++ include/linux/jbd.h 14 May 2002 15:31:34 -0000 @@ -32,6 +32,14 @@ #define journal_oom_retry 1 +/* + * Define JBD_PARANOID_WRITES to cause a kernel BUG() check if ext3 + * finds a buffer unexpectedly dirty. This is useful for debugging, but + * can cause spurious kernel panics if there are applications such as + * tune2fs modifying our buffer_heads behind our backs. + */ +#undef JBD_PARANOID_WRITES + #ifdef CONFIG_JBD_DEBUG /* * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal @@ -730,6 +738,10 @@ schedule(); \ } while (1) +extern void __jbd_unexpected_dirty_buffer(char *, int, struct journal_head *); +#define jbd_unexpected_dirty_buffer(jh) \ + __jbd_unexpected_dirty_buffer(__FUNCTION__, __LINE__, (jh)) + /* * is_journal_abort *
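
A few illustrative sketches of the interfaces touched above.

The fs/ext3/inode.c hunks change the walk_page_buffers() convention: the walker and every per-buffer callback now take the inode, so the ordered- and sync-data callbacks can file each buffer on the correct per-inode data queue. The sketch below is modelled on journal_dirty_sync_data() above and assumes the declarations already present in fs/ext3/inode.c; the names example_dirty_fn() and example_order_page() are invented for illustration only.

static int example_dirty_fn(handle_t *handle, struct inode *inode,
			    struct buffer_head *bh)
{
	int ret = ext3_journal_dirty_data(handle, bh, 0);

	/* New with this patch: a buffer last queued against some other
	 * inode is moved onto this inode's data queue, so that
	 * fsync_inode_data_buffers() can find it later in
	 * data=writeback mode. */
	if (bh->b_inode != inode)
		buffer_insert_inode_data_queue(bh, inode);
	return ret;
}

static int example_order_page(handle_t *handle, struct inode *inode,
			      struct page *page,
			      unsigned from, unsigned to)
{
	/* Callers simply thread the inode through; walk_page_buffers()
	 * forwards it to the per-buffer callback. */
	return walk_page_buffers(handle, inode, page->buffers,
				 from, to, NULL, example_dirty_fn);
}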
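
In fs/jbd/transaction.c, do_get_write_access() no longer guesses at a buffer's state while it is unlocked; it takes BH_Lock itself and only then inspects the dirty bit. The sketch below isolates just that acquisition step under an invented name, example_lock_bh_under_journal(); the real code instead restarts its whole "repeat:" sequence after reacquiring the journal lock, since the journaling state may have changed while it slept.

static void example_lock_bh_under_journal(journal_t *journal,
					  struct buffer_head *bh)
{
	/* Try to take the buffer lock without sleeping.  If someone
	 * else (bdflush, kupdate, a third-party reader) holds it, drop
	 * the journal lock so other journal users are not stalled
	 * while we sleep, wait for the buffer, and try again. */
	while (test_and_set_bit(BH_Lock, &bh->b_state)) {
		unlock_journal(journal);
		__wait_on_buffer(bh);
		lock_journal(journal);
	}
	/* The caller must unlock_buffer(bh) once it has finished
	 * probing or refiling the buffer. */
}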
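
Both __jbd_unexpected_dirty_buffer() in fs/jbd/journal.c and the reworked __journal_file_buffer() in fs/jbd/transaction.c now apply the same rule: a buffer headed for (or found on) the BJ_Metadata, BJ_Reserved, BJ_Shadow or BJ_Forget lists must not carry the ordinary dirty bit, because bdflush/kupdate would otherwise be free to write it out behind the journal's back; its dirtiness is carried by BH_JBDDirty instead. A sketch of that rule as a predicate; the helper itself is not in the patch, it only names the shared test.

static inline int example_jlist_hides_dirty_bit(int jlist)
{
	/* Lists whose buffers are written only via the journal: the
	 * normal BH_Dirty bit must be traded for BH_JBDDirty. */
	return (jlist == BJ_Metadata || jlist == BJ_Reserved ||
		jlist == BJ_Shadow   || jlist == BJ_Forget);
}

For those destination lists, __journal_file_buffer() records whether either dirty bit was set before it unfiles the buffer, and re-asserts BH_JBDDirty once the buffer sits on its new list; __jbd_unexpected_dirty_buffer() performs the same demotion when it finds a stray BH_Dirty, after logging a KERN_WARNING (or asserting, if JBD_PARANOID_WRITES is defined in include/linux/jbd.h).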