Go through all uses of b_transaction and implement the locking rules.
Fairly straightforward.

 fs/jbd/checkpoint.c  |   54 +++++++++++++++--
 fs/jbd/commit.c      |   10 +--
 fs/jbd/journal.c     |   50 ----------------
 fs/jbd/transaction.c |  159 ++++++++++++++++++++------------------------------
 4 files changed, 116 insertions(+), 157 deletions(-)

diff -puN fs/jbd/checkpoint.c~jbd-070-b_transaction-locking fs/jbd/checkpoint.c
--- 25/fs/jbd/checkpoint.c~jbd-070-b_transaction-locking	2003-05-28 13:02:26.000000000 -0700
+++ 25-akpm/fs/jbd/checkpoint.c	2003-05-28 13:02:26.000000000 -0700
@@ -50,6 +50,7 @@ static inline void __buffer_unlink(struc
  * Try to release a checkpointed buffer from its transaction.
  * Returns 1 if we released it.
  * Requires journal_datalist_lock
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __try_to_free_cp_buf(struct journal_head *jh)
 {
@@ -59,10 +60,13 @@ static int __try_to_free_cp_buf(struct j
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		__journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
 		ret = 1;
+	} else {
+		jbd_unlock_bh_state(bh);
 	}
 	return ret;
 }
@@ -93,6 +97,20 @@ void log_wait_for_space(journal_t *journ
 }
 
 /*
+ * We were unable to perform jbd_trylock_bh_state() inside
+ * journal_datalist_lock.  The caller must restart a list walk.  Wait for
+ * someone else to run jbd_unlock_bh_state().
+ */
+static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
+{
+	get_bh(bh);
+	spin_unlock(&journal_datalist_lock);
+	jbd_lock_bh_state(bh);
+	jbd_unlock_bh_state(bh);
+	put_bh(bh);
+}
+
+/*
  * Clean up a transaction's checkpoint list.
  *
  * We wait for any pending IO to complete and make sure any clean
@@ -131,12 +149,21 @@ static int __cleanup_transaction(journal
 			__brelse(bh);
 			goto out_return_1;
 		}
-		
+
+		/*
+		 * This is foul
+		 */
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			goto out_return_1;
+		}
+
 		if (jh->b_transaction != NULL) {
 			transaction_t *transaction = jh->b_transaction;
 			tid_t tid = transaction->t_tid;
 
 			spin_unlock(&journal_datalist_lock);
+			jbd_unlock_bh_state(bh);
 			log_start_commit(journal, transaction);
 			unlock_journal(journal);
 			log_wait_commit(journal, tid);
@@ -156,11 +183,13 @@ static int __cleanup_transaction(journal
 			if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
 				BUFFER_TRACE(bh, "remove from checkpoint");
 				__journal_remove_checkpoint(jh);
+				jbd_unlock_bh_state(bh);
 				journal_remove_journal_head(bh);
 				__brelse(bh);
 				ret = 1;
+			} else {
+				jbd_unlock_bh_state(bh);
 			}
-
 		jh = next_jh;
 	} while (jh != last_jh);
 
@@ -197,6 +226,7 @@ static void __flush_batch(struct buffer_
  * scan of the checkpoint list.
  *
  * Called with journal_datalist_lock held.
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __flush_buffer(journal_t *journal, struct journal_head *jh,
			struct buffer_head **bhs, int *batch_count,
@@ -216,10 +246,11 @@ static int __flush_buffer(journal_t *jou
	 * disk, as that would break recoverability.
	 */
 	BUFFER_TRACE(bh, "queue");
-	atomic_inc(&bh->b_count);
-	J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
-	set_bit(BH_JWrite, &bh->b_state);
+	get_bh(bh);
+	J_ASSERT_BH(bh, !buffer_jwrite(bh));
+	set_buffer_jwrite(bh);
 	bhs[*batch_count] = bh;
+	jbd_unlock_bh_state(bh);
 	(*batch_count)++;
 	if (*batch_count == NR_BATCH) {
 		__flush_batch(bhs, batch_count);
@@ -302,8 +333,16 @@ repeat:
 		last_jh = jh->b_cpprev;
 		next_jh = jh;
 		do {
+			struct buffer_head *bh;
+
 			jh = next_jh;
 			next_jh = jh->b_cpnext;
+			bh = jh2bh(jh);
+			if (!jbd_trylock_bh_state(bh)) {
+				jbd_sync_bh(journal, bh);
+				spin_lock(&journal_datalist_lock);
+				break;
+			}
 			retry = __flush_buffer(journal, jh, bhs,
						&batch_count, &drop_count);
 		} while (jh != last_jh && !retry);
@@ -439,10 +478,13 @@ int __journal_clean_checkpoint_list(jour
 		if (jh) {
 			struct journal_head *last_jh = jh->b_cpprev;
 			struct journal_head *next_jh = jh;
+
 			do {
 				jh = next_jh;
 				next_jh = jh->b_cpnext;
-				ret += __try_to_free_cp_buf(jh);
+				/* Use trylock because of the ranking */
+				if (jbd_trylock_bh_state(jh2bh(jh)))
+					ret += __try_to_free_cp_buf(jh);
 			} while (jh != last_jh);
 		}
 	} while (transaction != last_transaction);

diff -puN fs/jbd/commit.c~jbd-070-b_transaction-locking fs/jbd/commit.c
--- 25/fs/jbd/commit.c~jbd-070-b_transaction-locking	2003-05-28 13:02:26.000000000 -0700
+++ 25-akpm/fs/jbd/commit.c	2003-05-28 13:02:26.000000000 -0700
@@ -465,9 +465,10 @@ start_journal_io:
 	 * akpm: these are BJ_IO, and journal_datalist_lock is not needed.
 	 * See __journal_try_to_free_buffer.
 	 */
- wait_for_iobuf:
+wait_for_iobuf:
 	while (commit_transaction->t_iobuf_list != NULL) {
 		struct buffer_head *bh;
+
 		jh = commit_transaction->t_iobuf_list->b_tprev;
 		bh = jh2bh(jh);
 		if (buffer_locked(bh)) {
@@ -479,7 +480,7 @@ start_journal_io:
 			goto wait_for_iobuf;
 		}
 
-		clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
+		clear_buffer_jwrite(bh);
 
 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 		journal_unfile_buffer(jh);
@@ -495,7 +496,6 @@ start_journal_io:
 		 * ->t_iobuf_list should contain only dummy buffer_heads
 		 * which were created by journal_write_metadata_buffer().
 		 */
-		bh = jh2bh(jh);
 		BUFFER_TRACE(bh, "dumping temporary bh");
 		journal_put_journal_head(jh);
 		__brelse(bh);
@@ -637,6 +637,8 @@ skip_commit: /* The journal should be un
 		struct buffer_head *bh;
 
 		jh = commit_transaction->t_forget;
+		bh = jh2bh(jh);
+		jbd_lock_bh_state(bh);
 		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 			jh->b_transaction == journal->j_running_transaction);
@@ -650,7 +652,6 @@ skip_commit: /* The journal should be un
 		 *
 		 * Otherwise, we can just throw away the frozen data now.
 		 */
-		jbd_lock_bh_state(jh2bh(jh));
 		if (jh->b_committed_data) {
 			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
@@ -676,7 +677,6 @@ skip_commit: /* The journal should be un
 		 * by journal_forget, it may no longer be dirty and
 		 * there's no point in keeping a checkpoint record for
 		 * it. */
-		bh = jh2bh(jh);
 
 		/* A buffer which has been freed while still being
 		 * journaled by a previous transaction may end up still

diff -puN fs/jbd/journal.c~jbd-070-b_transaction-locking fs/jbd/journal.c
--- 25/fs/jbd/journal.c~jbd-070-b_transaction-locking	2003-05-28 13:02:26.000000000 -0700
+++ 25-akpm/fs/jbd/journal.c	2003-05-28 13:02:26.000000000 -0700
@@ -259,56 +259,6 @@ static void journal_kill_thread(journal_
 	}
 }
 
-#if 0
-
-This is no longer needed - we do it in commit quite efficiently.
-Note that if this function is resurrected, the loop needs to
-be reorganised into the next_jh/last_jh algorithm.
-
-/*
- * journal_clean_data_list: cleanup after data IO.
- *
- * Once the IO system has finished writing the buffers on the transaction's
- * data list, we can remove those buffers from the list.  This function
- * scans the list for such buffers and removes them cleanly.
- *
- * We assume that the journal is already locked.
- * We are called with journal_datalist_lock held.
- *
- * AKPM: This function looks inefficient.  Approximately O(n^2)
- * for potentially thousands of buffers.  It no longer shows on profiles
- * because these buffers are mainly dropped in journal_commit_transaction().
- */
-
-void __journal_clean_data_list(transaction_t *transaction)
-{
-	struct journal_head *jh, *next;
-
-	assert_spin_locked(&journal_datalist_lock);
-
-restart:
-	jh = transaction->t_sync_datalist;
-	if (!jh)
-		goto out;
-	do {
-		next = jh->b_tnext;
-		if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
-			struct buffer_head *bh = jh2bh(jh);
-			BUFFER_TRACE(bh, "data writeout complete: unfile");
-			__journal_unfile_buffer(jh);
-			jh->b_transaction = NULL;
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			goto restart;
-		}
-		jh = next;
-	} while (transaction->t_sync_datalist &&
-			jh != transaction->t_sync_datalist);
-out:
-	return;
-}
-#endif
-
 /*
  * journal_write_metadata_buffer: write a metadata buffer to the journal.
  *

diff -puN fs/jbd/transaction.c~jbd-070-b_transaction-locking fs/jbd/transaction.c
--- 25/fs/jbd/transaction.c~jbd-070-b_transaction-locking	2003-05-28 13:02:26.000000000 -0700
+++ 25-akpm/fs/jbd/transaction.c	2003-05-28 13:02:26.000000000 -0700
@@ -740,7 +740,7 @@ int journal_get_write_access (handle_t *
  *
  * Call this if you create a new bh.
  */
-int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
+int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -755,14 +755,18 @@ int journal_get_create_access (handle_t 
 	err = 0;
 
 	JBUFFER_TRACE(jh, "entry");
-	/* The buffer may already belong to this transaction due to
-	 * pre-zeroing in the filesystem's new_block code.  It may also
-	 * be on the previous, committing transaction's lists, but it
-	 * HAS to be in Forget state in that case: the transaction must
-	 * have deleted the buffer for it to be reused here. */
+	/*
+	 * The buffer may already belong to this transaction due to pre-zeroing
+	 * in the filesystem's new_block code.  It may also be on the previous,
+	 * committing transaction's lists, but it HAS to be in Forget state in
+	 * that case: the transaction must have deleted the buffer for it to be
+	 * reused here.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal_datalist_lock);
 	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
-		jh->b_transaction == NULL ||
-		(jh->b_transaction == journal->j_committing_transaction &&
+		 jh->b_transaction == NULL ||
+		 (jh->b_transaction == journal->j_committing_transaction &&
			  jh->b_jlist == BJ_Forget)));
 
 	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
@@ -771,7 +775,6 @@ int journal_get_create_access (handle_t 
 	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
 	handle->h_buffer_credits--;
 
-	spin_lock(&journal_datalist_lock);
 	if (jh->b_transaction == NULL) {
 		jh->b_transaction = transaction;
 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
@@ -781,6 +784,7 @@ int journal_get_create_access (handle_t 
 		jh->b_next_transaction = transaction;
 	}
 	spin_unlock(&journal_datalist_lock);
+	jbd_unlock_bh_state(bh);
 
 	/*
	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
@@ -799,10 +803,9 @@ out:
 	return err;
 }
 
-
-
 /**
- * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
+ * int journal_get_undo_access() - Notify intent to modify metadata with
+ *	non-rewindable consequences
  * @handle: transaction
 * @bh: buffer to undo
 *
@@ -932,6 +935,7 @@ int journal_dirty_data (handle_t *handle
 	 * never, ever allow this to happen: there's nothing we can do
 	 * about it in this layer.
 	 */
+	jbd_lock_bh_state(bh);
 	spin_lock(&journal_datalist_lock);
 	if (jh->b_transaction) {
 		JBUFFER_TRACE(jh, "has transaction");
@@ -986,10 +990,12 @@ int journal_dirty_data (handle_t *handle
 		 * commit to never terminate.
 		 */
 		if (buffer_dirty(bh)) {
-			atomic_inc(&bh->b_count);
+			get_bh(bh);
 			spin_unlock(&journal_datalist_lock);
+			jbd_unlock_bh_state(bh);
 			need_brelse = 1;
 			sync_dirty_buffer(bh);
+			jbd_lock_bh_state(bh);
 			spin_lock(&journal_datalist_lock);
 			/* The buffer may become locked again at any time if
 			 * it is redirtied */
@@ -1025,6 +1031,7 @@ int journal_dirty_data (handle_t *handle
 	}
 no_journal:
 	spin_unlock(&journal_datalist_lock);
+	jbd_unlock_bh_state(bh);
 	if (need_brelse) {
 		BUFFER_TRACE(bh, "brelse");
 		__brelse(bh);
@@ -1053,7 +1060,7 @@ no_journal:
 * buffer: that only gets done when the old transaction finally
 * completes its commit.
 */
-int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
+int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -1113,7 +1120,7 @@ out_unlock:
 * journaling component to decide after the write access is returned
 * that global state has changed and the update is no longer required.
 */
-void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
+void journal_release_buffer(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -1126,6 +1133,7 @@ void journal_release_buffer (handle_t *h
 	 * transaction, then it is safe to release it.  In all other
 	 * cases, just leave the buffer as it is.
 	 */
+	jbd_lock_bh_state(bh);
 	spin_lock(&journal_datalist_lock);
 	if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
	    !buffer_jbddirty(jh2bh(jh))) {
@@ -1134,6 +1142,7 @@ void journal_release_buffer (handle_t *h
 		__journal_refile_buffer(jh);
 	}
 	spin_unlock(&journal_datalist_lock);
+	jbd_unlock_bh_state(bh);
 
 	JBUFFER_TRACE(jh, "exit");
 	unlock_journal(journal);
@@ -1236,66 +1245,6 @@ not_jbd:
 	return;
 }
 
-#if 0 /* Unused */
-/*
- * journal_sync_buffer: flush a potentially-journaled buffer to disk.
- *
- * Used for O_SYNC filesystem operations.  If the buffer is journaled,
- * we need to complete the O_SYNC by waiting for the transaction to
- * complete.  It is an error to call journal_sync_buffer before
- * journal_stop!
- */
-
-void journal_sync_buffer(struct buffer_head *bh)
-{
-	transaction_t *transaction;
-	journal_t *journal;
-	long sequence;
-	struct journal_head *jh;
-
-	/* If the buffer isn't journaled, this is easy: just sync it to
-	 * disk.  */
-	BUFFER_TRACE(bh, "entry");
-
-	spin_lock(&journal_datalist_lock);
-	if (!buffer_jbd(bh)) {
-		spin_unlock(&journal_datalist_lock);
-		return;
-	}
-	jh = bh2jh(bh);
-	if (jh->b_transaction == NULL) {
-		/* If the buffer has already been journaled, then this
-		 * is a noop. */
-		if (jh->b_cp_transaction == NULL) {
-			spin_unlock(&journal_datalist_lock);
-			return;
-		}
-		atomic_inc(&bh->b_count);
-		spin_unlock(&journal_datalist_lock);
-		sync_dirty_buffer(bh);
-		__brelse(bh);
-		goto out;
-	}
-
-	/* Otherwise, just wait until the transaction is synced to disk. */
-	transaction = jh->b_transaction;
-	journal = transaction->t_journal;
-	sequence = transaction->t_tid;
-	spin_unlock(&journal_datalist_lock);
-
-	jbd_debug(2, "requesting commit for jh %p\n", jh);
-	log_start_commit (journal, transaction);
-
-	while (tid_gt(sequence, journal->j_commit_sequence)) {
-		wake_up(&journal->j_wait_done_commit);
-		sleep_on(&journal->j_wait_done_commit);
-	}
-	JBUFFER_TRACE(jh, "exit");
-out:
-	return;
-}
-#endif
-
 /**
  * void journal_callback_set() -  Register a callback function for this handle.
  * @handle: handle to attach the callback to.
@@ -1584,7 +1533,7 @@ void journal_unfile_buffer(struct journa
  * One could use journal_datalist_lock to get unracy access to a
  * per-journal lock.
  *
- * Called with journal_datalist_lock held.
+ * Called under jbd_lock_bh_state(bh)
  *
  * Returns non-zero iff we were able to free the journal_head.
  */
@@ -1592,8 +1541,6 @@ static inline int __journal_try_to_free_
 {
 	struct journal_head *jh;
 
-	assert_spin_locked(&journal_datalist_lock);
-
 	jh = bh2jh(bh);
 
 	if (buffer_locked(bh) || buffer_dirty(bh))
@@ -1602,6 +1549,7 @@ static inline int __journal_try_to_free_
 	if (jh->b_next_transaction != 0)
 		goto out;
 
+	spin_lock(&journal_datalist_lock);
 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
 		if (jh->b_jlist == BJ_SyncData) {
 			/* A written-back ordered data buffer */
@@ -1620,6 +1568,7 @@ static inline int __journal_try_to_free_
 			__brelse(bh);
 		}
 	}
+	spin_unlock(&journal_datalist_lock);
 	return !buffer_jbd(bh);
 
 out:
@@ -1672,14 +1621,14 @@ int journal_try_to_free_buffers(journal_
 	head = page_buffers(page);
 	bh = head;
-	spin_lock(&journal_datalist_lock);
 	do {
+		jbd_lock_bh_state(bh);
 		if (buffer_jbd(bh) &&
				!__journal_try_to_free_buffer(bh)) {
-			spin_unlock(&journal_datalist_lock);
+			jbd_unlock_bh_state(bh);
 			goto busy;
 		}
+		jbd_unlock_bh_state(bh);
 	} while ((bh = bh->b_this_page) != head);
-	spin_unlock(&journal_datalist_lock);
 	ret = try_to_free_buffers(page);
 busy:
 	return ret;
@@ -1692,28 +1641,29 @@ busy:
  * this transaction commits.  If the buffer isn't on a checkpoint list, we
  * release it.
  * Returns non-zero if JBD no longer has an interest in the buffer.
+ *
+ * Called under journal_datalist_lock.
+ *
+ * Called under jbd_lock_bh_state(bh).
  */
-static int dispose_buffer(struct journal_head *jh,
-		transaction_t *transaction)
+static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 {
 	int may_free = 1;
 	struct buffer_head *bh = jh2bh(jh);
 
-	spin_lock(&journal_datalist_lock);
 	__journal_unfile_buffer(jh);
 	jh->b_transaction = 0;
 
 	if (jh->b_cp_transaction) {
 		JBUFFER_TRACE(jh, "on running+cp transaction");
 		__journal_file_buffer(jh, transaction, BJ_Forget);
-		clear_bit(BH_JBDDirty, &bh->b_state);
+		clear_buffer_jbddirty(bh);
 		may_free = 0;
 	} else {
 		JBUFFER_TRACE(jh, "on running transaction");
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
-	spin_unlock(&journal_datalist_lock);
 	return may_free;
 }
@@ -1769,6 +1719,7 @@ static int journal_unmap_buffer(journal_
 	transaction_t *transaction;
 	struct journal_head *jh;
 	int may_free = 1;
+	int ret;
 
 	BUFFER_TRACE(bh, "entry");
@@ -1778,8 +1729,10 @@ static int journal_unmap_buffer(journal_
	 * page lock.  --sct */
 	if (!buffer_jbd(bh))
-		goto zap_buffer;
+		goto zap_buffer_unlocked;
 
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal_datalist_lock);
 	jh = bh2jh(bh);
 	transaction = jh->b_transaction;
 	if (transaction == NULL) {
@@ -1806,8 +1759,11 @@ static int journal_unmap_buffer(journal_
 			 * committed, the buffer won't be needed any
 			 * longer. */
 			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
-			return dispose_buffer(jh,
+			ret = __dispose_buffer(jh,
					journal->j_running_transaction);
+			spin_unlock(&journal_datalist_lock);
+			jbd_unlock_bh_state(bh);
+			return ret;
 		} else {
 			/* There is no currently-running transaction. So the
 			 * orphan record which we wrote for this file must have
@@ -1815,12 +1771,15 @@ static int journal_unmap_buffer(journal_
 			 * the committing transaction, if it exists. */
 			if (journal->j_committing_transaction) {
 				JBUFFER_TRACE(jh, "give to committing trans");
-				return dispose_buffer(jh,
+				ret = __dispose_buffer(jh,
					journal->j_committing_transaction);
+				spin_unlock(&journal_datalist_lock);
+				jbd_unlock_bh_state(bh);
+				return ret;
 			} else {
 				/* The orphan record's transaction has
 				 * committed.  We can cleanse this buffer */
-				clear_bit(BH_JBDDirty, &bh->b_state);
+				clear_buffer_jbddirty(bh);
 				goto zap_buffer;
 			}
 		}
@@ -1836,6 +1795,8 @@ static int journal_unmap_buffer(journal_
					journal->j_running_transaction);
 			jh->b_next_transaction = NULL;
 		}
+		spin_unlock(&journal_datalist_lock);
+		jbd_unlock_bh_state(bh);
 		return 0;
 	} else {
 		/* Good, the buffer belongs to the running transaction.
@@ -1845,10 +1806,13 @@ static int journal_unmap_buffer(journal_
 		 * i_size already for this truncate so recovery will not
 		 * expose the disk blocks we are discarding here.) */
 		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
-		may_free = dispose_buffer(jh, transaction);
+		may_free = __dispose_buffer(jh, transaction);
 	}
 
-zap_buffer:	
+zap_buffer:
+	spin_unlock(&journal_datalist_lock);
+	jbd_unlock_bh_state(bh);
+zap_buffer_unlocked:
 	clear_buffer_dirty(bh);
 	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
 	clear_buffer_mapped(bh);
@@ -2007,6 +1971,7 @@ void journal_file_buffer(struct journal_
 void __journal_refile_buffer(struct journal_head *jh)
 {
 	int was_dirty;
+	struct buffer_head *bh = jh2bh(jh);
 
 	assert_spin_locked(&journal_datalist_lock);
@@ -2017,10 +1982,12 @@ void __journal_refile_buffer(struct jour
 		return;
 	}
 
-	/* It has been modified by a later transaction: add it to the
-	 * new transaction's metadata list. */
+	/*
+	 * It has been modified by a later transaction: add it to the new
+	 * transaction's metadata list.
+	 */
 
-	was_dirty = test_clear_buffer_jbddirty(jh2bh(jh));
+	was_dirty = test_clear_buffer_jbddirty(bh);
 	__journal_unfile_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
@@ -2028,7 +1995,7 @@ void __journal_refile_buffer(struct jour
 	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
 	if (was_dirty)
-		set_buffer_jbddirty(jh2bh(jh));
+		set_buffer_jbddirty(bh);
 }
 
 /*
_
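
A note on the locking rule the patch implements: jbd_lock_bh_state() ranks
above journal_datalist_lock, i.e. the bh_state lock must be taken first
(journal_get_create_access() and journal_dirty_data() above follow that
order directly).  The checkpoint-list walkers already hold
journal_datalist_lock, so they may only jbd_trylock_bh_state(); on failure
they drop the list lock, wait for the current holder via jbd_sync_bh(),
and restart the walk.  The sketch below models that trylock-and-restart
pattern in userspace with pthread mutexes.  It is illustrative only:
list_lock, bh_state, sync_bh() and walk_list() are made-up names, not JBD
interfaces, and the buffer pinning done by get_bh()/put_bh() is omitted.

	#include <pthread.h>
	#include <stdio.h>

	/* Outer lock: stands in for journal_datalist_lock. */
	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Higher-ranked lock that must be taken first: stands in for
	 * jbd_lock_bh_state(). */
	static pthread_mutex_t bh_state = PTHREAD_MUTEX_INITIALIZER;

	/*
	 * Models jbd_sync_bh(): we could not trylock bh_state while
	 * holding list_lock, so drop list_lock and wait for the current
	 * holder to run the equivalent of jbd_unlock_bh_state().
	 */
	static void sync_bh(void)
	{
		pthread_mutex_unlock(&list_lock);
		pthread_mutex_lock(&bh_state);	/* blocks until holder exits */
		pthread_mutex_unlock(&bh_state);
	}

	/* Models one pass over a checkpoint-style list. */
	static void walk_list(void)
	{
	restart:
		pthread_mutex_lock(&list_lock);
		/* ... pick the next buffer on the list ... */
		if (pthread_mutex_trylock(&bh_state) != 0) {
			/* The lock ranking forbids blocking here. */
			sync_bh();
			goto restart;	/* list may have changed: rewalk */
		}
		/* ... operate on the buffer under both locks ... */
		pthread_mutex_unlock(&bh_state);
		pthread_mutex_unlock(&list_lock);
	}

	int main(void)
	{
		walk_list();
		printf("walk completed without inverting the lock order\n");
		return 0;
	}

The design point is the standard escape from lock-order inversion: the
walker never blocks on the higher-ranked lock while holding the lower one,
which is why every failed trylock must be followed by a full restart of
the list walk rather than a retry in place.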