We now start to move across the JBD data structures' fields, working from the "innermost" fields outwards.

We start with journal_head.b_frozen_data, because the locking for this field was partially implemented in jbd-010-b_committed_data-race-fix.patch.

b_frozen_data is protected by jbd_lock_bh_state().  The lock_journal() and spin_lock(&journal_datalist_lock) calls are left in place for now; later in the series, spin_lock(&journal_datalist_lock) is replaced by spin_lock(&journal->j_list_lock).

Of course, completing the locking around b_frozen_data also puts much of the locking for other fields in place.
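For reference, the bh-state lock is not a conventional spinlock: it is a bit spinlock taken on a flag bit in the buffer_head's b_state word, so it costs no extra storage in struct buffer_head.  A sketch of the helpers, assuming the usual bit_spin_lock() primitives and the jbd-private BH_State bit (illustrative, not part of this patch):

        #include <linux/buffer_head.h>

        static inline void jbd_lock_bh_state(struct buffer_head *bh)
        {
                /* Spin on the BH_State bit in bh->b_state */
                bit_spin_lock(BH_State, &bh->b_state);
        }

        static inline int jbd_trylock_bh_state(struct buffer_head *bh)
        {
                /* Returns nonzero on success.  Needed by callers which
                 * already hold a lock which ranks after the bh-state
                 * lock (see the commit.c hunk below). */
                return bit_spin_trylock(BH_State, &bh->b_state);
        }

        static inline void jbd_unlock_bh_state(struct buffer_head *bh)
        {
                bit_spin_unlock(BH_State, &bh->b_state);
        }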
 25-akpm/fs/jbd/commit.c      |   13 +++++++++
 25-akpm/fs/jbd/journal.c     |   55 +++++++++++++++++++++++-------------------
 25-akpm/fs/jbd/transaction.c |   56 ++++++++++++++++++++-----------------------
 fs/jbd/revoke.c              |    0 
 4 files changed, 69 insertions(+), 55 deletions(-)

diff -puN fs/jbd/journal.c~jbd-050-b_frozen_data-locking fs/jbd/journal.c
--- 25/fs/jbd/journal.c~jbd-050-b_frozen_data-locking	Thu Jun 5 15:14:18 2003
+++ 25-akpm/fs/jbd/journal.c	Thu Jun 5 15:14:18 2003
@@ -359,9 +359,10 @@ int journal_write_metadata_buffer(transa
         int do_escape = 0;
         char *mapped_data;
         struct buffer_head *new_bh;
-        struct journal_head * new_jh;
+        struct journal_head *new_jh;
         struct page *new_page;
         unsigned int new_offset;
+        struct buffer_head *bh_in = jh2bh(jh_in);
 
         /*
          * The buffer really shouldn't be locked: only the current committing
@@ -372,13 +373,16 @@ int journal_write_metadata_buffer(transa
          * also part of a shared mapping, and another thread has
          * decided to launch a writepage() against this buffer.
          */
-        J_ASSERT_JH(jh_in, buffer_jbddirty(jh2bh(jh_in)));
+        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+
+        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 
         /*
          * If a new transaction has already done a buffer copy-out, then
         * we use that version of the data for the commit.
          */
-
+        jbd_lock_bh_state(bh_in);
+repeat:
         if (jh_in->b_frozen_data) {
                 done_copy_out = 1;
                 new_page = virt_to_page(jh_in->b_frozen_data);
@@ -388,12 +392,13 @@ int journal_write_metadata_buffer(transa
                 new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
         }
 
-        mapped_data = ((char *) kmap(new_page)) + new_offset;
+        mapped_data = kmap_atomic(new_page, KM_USER0);
 
         /*
          * Check for escaping
          */
-        if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
+        if (*((unsigned int *)(mapped_data + new_offset)) ==
+                                htonl(JFS_MAGIC_NUMBER)) {
                 need_copy_out = 1;
                 do_escape = 1;
         }
@@ -401,38 +406,47 @@ int journal_write_metadata_buffer(transa
         /*
          * Do we need to do a data copy?
          */
-
         if (need_copy_out && !done_copy_out) {
                 char *tmp;
-                tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
+
+                kunmap_atomic(mapped_data, KM_USER0);
+                jbd_unlock_bh_state(bh_in);
+                tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+                jbd_lock_bh_state(bh_in);
+                if (jh_in->b_frozen_data) {
+                        kfree(tmp);
+                        goto repeat;
+                }
 
                 jh_in->b_frozen_data = tmp;
-                memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
-
+                mapped_data = kmap_atomic(new_page, KM_USER0);
+                memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+
                 /* If we get to this path, we'll always need the new
                    address kmapped so that we can clear the escaped
                    magic number below. */
-                kunmap(new_page);
                 new_page = virt_to_page(tmp);
                 new_offset = virt_to_offset(tmp);
-                mapped_data = ((char *) kmap(new_page)) + new_offset;
-
                 done_copy_out = 1;
         }
 
         /*
-         * Right, time to make up the new buffer_head.
+         * Did we need to do an escaping?  Now we've done all the
+         * copying, we can finally do so.
          */
-        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+        if (do_escape)
+                *((unsigned int *)(mapped_data + new_offset)) = 0;
+        kunmap_atomic(mapped_data, KM_USER0);
 
         /* keep subsequent assertions sane */
         new_bh->b_state = 0;
         init_buffer(new_bh, NULL, NULL);
         atomic_set(&new_bh->b_count, 1);
-        new_jh = journal_add_journal_head(new_bh);
+        jbd_unlock_bh_state(bh_in);
 
-        set_bh_page(new_bh, new_page, new_offset);
+        new_jh = journal_add_journal_head(new_bh);        /* This sleeps */
 
+        set_bh_page(new_bh, new_page, new_offset);
         new_jh->b_transaction = NULL;
         new_bh->b_size = jh2bh(jh_in)->b_size;
         new_bh->b_bdev = transaction->t_journal->j_dev;
@@ -443,15 +457,6 @@ int journal_write_metadata_buffer(transa
         *jh_out = new_jh;
 
         /*
-         * Did we need to do an escaping?  Now we've done all the
-         * copying, we can finally do so.
-         */
-
-        if (do_escape)
-                * ((unsigned int *) mapped_data) = 0;
-        kunmap(new_page);
-
-        /*
          * The to-be-written buffer needs to get moved to the io queue,
          * and the original buffer whose contents we are shadowing or
          * copying is moved to the transaction's shadow queue.
diff -puN fs/jbd/transaction.c~jbd-050-b_frozen_data-locking fs/jbd/transaction.c
--- 25/fs/jbd/transaction.c~jbd-050-b_frozen_data-locking	Thu Jun 5 15:14:18 2003
+++ 25-akpm/fs/jbd/transaction.c	Thu Jun 5 15:14:18 2003
@@ -502,7 +502,6 @@ do_get_write_access(handle_t *handle, st
         int error;
         char *frozen_buffer = NULL;
         int need_copy = 0;
-        int locked;
 
         jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
 
@@ -512,16 +511,9 @@ repeat:
 
         /* @@@ Need to check for errors here at some point. */
 
-        locked = test_set_buffer_locked(bh);
-        if (locked) {
-                /* We can't reliably test the buffer state if we found
-                 * it already locked, so just wait for the lock and
-                 * retry. */
-                unlock_journal(journal);
-                wait_on_buffer(bh);
-                lock_journal(journal);
-                goto repeat;
-        }
+        lock_buffer(bh);
+        jbd_lock_bh_state(bh);
+        spin_lock(&journal_datalist_lock);
 
         /* We now hold the buffer lock so it is safe to query the buffer
          * state.  Is the buffer dirty?
@@ -537,7 +529,6 @@ repeat:
          * the buffer dirtied, ugh.)
          */
         if (buffer_dirty(bh)) {
-                spin_lock(&journal_datalist_lock);
                 /* First question: is this buffer already part of the
                  * current transaction or the existing committing
                  * transaction? */
@@ -552,18 +543,18 @@ repeat:
                         JBUFFER_TRACE(jh, "Unexpected dirty buffer");
                         jbd_unexpected_dirty_buffer(jh);
                 }
-                spin_unlock(&journal_datalist_lock);
         }
 
         unlock_buffer(bh);
 
         error = -EROFS;
-        if (is_handle_aborted(handle))
+        if (is_handle_aborted(handle)) {
+                spin_unlock(&journal_datalist_lock);
+                jbd_unlock_bh_state(bh);
                 goto out_unlocked;
+        }
         error = 0;
 
-        spin_lock(&journal_datalist_lock);
-
         /* The buffer is already part of this transaction if
          * b_transaction or b_next_transaction points to it.
          */
@@ -606,6 +597,7 @@ repeat:
 
                         JBUFFER_TRACE(jh, "on shadow: sleep");
                         spin_unlock(&journal_datalist_lock);
+                        jbd_unlock_bh_state(bh);
                         unlock_journal(journal);
                         /* commit wakes up all shadow buffers after IO */
                         wqh = bh_waitq_head(jh2bh(jh));
@@ -633,16 +625,16 @@ repeat:
                 if (!frozen_buffer) {
                         JBUFFER_TRACE(jh, "allocate memory for buffer");
                         spin_unlock(&journal_datalist_lock);
-                        unlock_journal(journal);
+                        jbd_unlock_bh_state(bh);
                         frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
                                                     GFP_NOFS);
-                        lock_journal(journal);
                         if (!frozen_buffer) {
                                 printk(KERN_EMERG
                                        "%s: OOM for frozen_buffer\n",
                                        __FUNCTION__);
                                 JBUFFER_TRACE(jh, "oom!");
                                 error = -ENOMEM;
+                                jbd_lock_bh_state(bh);
                                 spin_lock(&journal_datalist_lock);
                                 goto done_locked;
                         }
@@ -672,7 +664,6 @@ repeat:
         }
 
 done_locked:
-        spin_unlock(&journal_datalist_lock);
         if (need_copy) {
                 struct page *page;
                 int offset;
@@ -682,17 +673,16 @@ done_locked:
                                "Possible IO failure.\n");
                 page = jh2bh(jh)->b_page;
                 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
-                source = kmap(page);
+                source = kmap_atomic(page, KM_USER0);
                 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-                kunmap(page);
+                kunmap_atomic(source, KM_USER0);
         }
-
+        spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(bh);
 
         /* If we are about to journal a buffer, then any revoke pending on it
          * is no longer valid. */
-        lock_kernel();
         journal_cancel_revoke(handle, jh);
-        unlock_kernel();
 
 out_unlocked:
         if (frozen_buffer)
@@ -1070,8 +1060,9 @@ int journal_dirty_metadata (handle_t *ha
         if (is_handle_aborted(handle))
                 goto out_unlock;
 
+        jbd_lock_bh_state(bh);
         spin_lock(&journal_datalist_lock);
-        set_bit(BH_JBDDirty, &bh->b_state);
+        set_buffer_jbddirty(bh);
 
         J_ASSERT_JH(jh, jh->b_transaction != NULL);
 
@@ -1102,6 +1093,7 @@ int journal_dirty_metadata (handle_t *ha
 
 done_locked:
         spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(bh);
         JBUFFER_TRACE(jh, "exit");
 out_unlock:
         unlock_journal(journal);
@@ -1168,6 +1160,7 @@ void journal_forget (handle_t *handle, s
         BUFFER_TRACE(bh, "entry");
 
         lock_journal(journal);
+        jbd_lock_bh_state(bh);
         spin_lock(&journal_datalist_lock);
 
         if (!buffer_jbd(bh))
@@ -1181,7 +1174,7 @@ void journal_forget (handle_t *handle, s
                  * of this transaction, then we can just drop it from
                  * the transaction immediately. */
                 clear_buffer_dirty(bh);
-                clear_bit(BH_JBDDirty, &bh->b_state);
+                clear_buffer_jbddirty(bh);
 
                 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
                 J_ASSERT_JH(jh, !jh->b_committed_data);
@@ -1208,6 +1201,7 @@ void journal_forget (handle_t *handle, s
                         __brelse(bh);
                         if (!buffer_jbd(bh)) {
                                 spin_unlock(&journal_datalist_lock);
+                                jbd_unlock_bh_state(bh);
                                 unlock_journal(journal);
                                 __bforget(bh);
                                 return;
@@ -1231,6 +1225,7 @@ void journal_forget (handle_t *handle, s
 
 not_jbd:
         spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(bh);
         unlock_journal(journal);
         __brelse(bh);
         return;
@@ -1924,7 +1919,7 @@ void __journal_file_buffer(struct journa
         struct buffer_head *bh = jh2bh(jh);
 
         assert_spin_locked(&journal_datalist_lock);
-        
+
 #ifdef __SMP__
         J_ASSERT (current->lock_depth >= 0);
 #endif
@@ -1990,9 +1985,11 @@ void __journal_file_buffer(struct journa
 void journal_file_buffer(struct journal_head *jh,
                         transaction_t *transaction, int jlist)
 {
+        jbd_lock_bh_state(jh2bh(jh));
         spin_lock(&journal_datalist_lock);
         __journal_file_buffer(jh, transaction, jlist);
         spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(jh2bh(jh));
 }
 
 /*
@@ -2045,12 +2042,13 @@ void __journal_refile_buffer(struct jour
  */
 void journal_refile_buffer(struct journal_head *jh)
 {
-        struct buffer_head *bh;
+        struct buffer_head *bh = jh2bh(jh);
 
+        jbd_lock_bh_state(bh);
         spin_lock(&journal_datalist_lock);
-        bh = jh2bh(jh);
 
         __journal_refile_buffer(jh);
+        jbd_unlock_bh_state(bh);
         journal_remove_journal_head(bh);
 
         spin_unlock(&journal_datalist_lock);
diff -puN fs/jbd/commit.c~jbd-050-b_frozen_data-locking fs/jbd/commit.c
--- 25/fs/jbd/commit.c~jbd-050-b_frozen_data-locking	Thu Jun 5 15:14:18 2003
+++ 25-akpm/fs/jbd/commit.c	Thu Jun 5 15:14:18 2003
@@ -210,8 +210,18 @@ write_out_data_locked:
                         wbuf[bufs++] = bh;
                 } else {
                         BUFFER_TRACE(bh, "writeout complete: unfile");
+                        /*
+                         * We have a lock ranking problem..
+                         */
+                        if (!jbd_trylock_bh_state(bh)) {
+                                spin_unlock(&journal_datalist_lock);
+                                schedule();
+                                spin_lock(&journal_datalist_lock);
+                                break;
+                        }
                         __journal_unfile_buffer(jh);
                         jh->b_transaction = NULL;
+                        jbd_unlock_bh_state(bh);
                         journal_remove_journal_head(bh);
                         __brelse(bh);
                 }
@@ -652,7 +662,6 @@ skip_commit: /* The journal should be un
                         kfree(jh->b_frozen_data);
                         jh->b_frozen_data = NULL;
                 }
-                jbd_unlock_bh_state(jh2bh(jh));
 
                 spin_lock(&journal_datalist_lock);
                 cp_transaction = jh->b_cp_transaction;
@@ -687,11 +696,13 @@ skip_commit: /* The journal should be un
                         __journal_insert_checkpoint(jh, commit_transaction);
                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                         __journal_refile_buffer(jh);
+                        jbd_unlock_bh_state(bh);
                 } else {
                         J_ASSERT_BH(bh, !buffer_dirty(bh));
                         J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                         __journal_unfile_buffer(jh);
                         jh->b_transaction = 0;
+                        jbd_unlock_bh_state(bh);
                         journal_remove_journal_head(bh);
                         __brelse(bh);
                 }
diff -puN fs/jbd/revoke.c~jbd-050-b_frozen_data-locking fs/jbd/revoke.c
_
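A note on the jbd_trylock_bh_state() in the commit.c hunk: the ranking being established is bh-state lock first, then journal_datalist_lock - that is the order in which do_get_write_access() and journal_file_buffer() now acquire them.  The data-writeout loop in commit runs with journal_datalist_lock already held, so it must not spin waiting for the bh-state lock: the holder may itself be spinning on journal_datalist_lock.  Hence the trylock-and-back-off.  Schematically (the function name and retry structure here are illustrative only, not code from the patch):

        static void unfile_buffer_ranked(struct journal_head *jh)
        {
                struct buffer_head *bh = jh2bh(jh);
        retry:
                spin_lock(&journal_datalist_lock);
                if (!jbd_trylock_bh_state(bh)) {
                        /* The bh-state holder may be spinning on
                         * journal_datalist_lock: drop our lock so it
                         * can finish, then start over. */
                        spin_unlock(&journal_datalist_lock);
                        schedule();
                        goto retry;
                }
                /* Both locks held, in the correct order */
                __journal_unfile_buffer(jh);
                jh->b_transaction = NULL;
                jbd_unlock_bh_state(bh);
                spin_unlock(&journal_datalist_lock);
        }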