We now start to move across the JBD data structures' fields, working from the "innermost" fields outwards.

We start with journal_head.b_frozen_data, because the locking for this field was partially implemented in jbd-010-b_committed_data-race-fix.patch.

b_frozen_data is protected by jbd_lock_bh_state().  The lock_journal() and spin_lock(&journal_datalist_lock) calls are left in place for now; later in the series, spin_lock(&journal_datalist_lock) is replaced by spin_lock(&journal->j_list_lock).

Of course, completing the locking around b_frozen_data also puts much of the locking for other fields in place.
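For reference, the bh-state lock is not a conventional spinlock: it is a bit spinlock taken on a flag bit in the buffer_head's b_state word, so it costs no extra storage in struct buffer_head.  A sketch of the helpers, assuming the usual bit_spin_lock() primitives and the jbd-private BH_State bit (illustrative, not part of this patch):

        #include <linux/buffer_head.h>

        static inline void jbd_lock_bh_state(struct buffer_head *bh)
        {
                /* Spin on the BH_State bit in bh->b_state */
                bit_spin_lock(BH_State, &bh->b_state);
        }

        static inline int jbd_trylock_bh_state(struct buffer_head *bh)
        {
                /* Returns nonzero on success.  Needed by callers which
                 * already hold a lock which ranks after the bh-state
                 * lock (see the commit.c hunk below). */
                return bit_spin_trylock(BH_State, &bh->b_state);
        }

        static inline void jbd_unlock_bh_state(struct buffer_head *bh)
        {
                bit_spin_unlock(BH_State, &bh->b_state);
        }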
 25-akpm/fs/jbd/commit.c      |   13 +++++++++
 25-akpm/fs/jbd/journal.c     |   55 +++++++++++++++++++++++-------------------
 25-akpm/fs/jbd/transaction.c |   56 ++++++++++++++++++++-----------------------
 fs/jbd/revoke.c              |    0 
 4 files changed, 69 insertions(+), 55 deletions(-)

diff -puN fs/jbd/journal.c~jbd-050-b_frozen_data-locking fs/jbd/journal.c
--- 25/fs/jbd/journal.c~jbd-050-b_frozen_data-locking	Thu Jun 5 15:14:18 2003
+++ 25-akpm/fs/jbd/journal.c	Thu Jun 5 15:14:18 2003
@@ -359,9 +359,10 @@ int journal_write_metadata_buffer(transa
         int do_escape = 0;
         char *mapped_data;
         struct buffer_head *new_bh;
-        struct journal_head * new_jh;
+        struct journal_head *new_jh;
         struct page *new_page;
         unsigned int new_offset;
+        struct buffer_head *bh_in = jh2bh(jh_in);
 
         /*
          * The buffer really shouldn't be locked: only the current committing
@@ -372,13 +373,16 @@ int journal_write_metadata_buffer(transa
          * also part of a shared mapping, and another thread has
          * decided to launch a writepage() against this buffer.
          */
-        J_ASSERT_JH(jh_in, buffer_jbddirty(jh2bh(jh_in)));
+        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+
+        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 
         /*
          * If a new transaction has already done a buffer copy-out, then
         * we use that version of the data for the commit.
          */
-
+        jbd_lock_bh_state(bh_in);
+repeat:
         if (jh_in->b_frozen_data) {
                 done_copy_out = 1;
                 new_page = virt_to_page(jh_in->b_frozen_data);
@@ -388,12 +392,13 @@ int journal_write_metadata_buffer(transa
                 new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
         }
 
-        mapped_data = ((char *) kmap(new_page)) + new_offset;
+        mapped_data = kmap_atomic(new_page, KM_USER0);
 
         /*
          * Check for escaping
          */
-        if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
+        if (*((unsigned int *)(mapped_data + new_offset)) ==
+                                htonl(JFS_MAGIC_NUMBER)) {
                 need_copy_out = 1;
                 do_escape = 1;
         }
@@ -401,38 +406,47 @@ int journal_write_metadata_buffer(transa
         /*
          * Do we need to do a data copy?
          */
-
         if (need_copy_out && !done_copy_out) {
                 char *tmp;
-                tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
+
+                kunmap_atomic(mapped_data, KM_USER0);
+                jbd_unlock_bh_state(bh_in);
+                tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+                jbd_lock_bh_state(bh_in);
+                if (jh_in->b_frozen_data) {
+                        kfree(tmp);
+                        goto repeat;
+                }
 
                 jh_in->b_frozen_data = tmp;
-                memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
-
+                mapped_data = kmap_atomic(new_page, KM_USER0);
+                memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+
                 /* If we get to this path, we'll always need the new
                    address kmapped so that we can clear the escaped
                    magic number below. */
-                kunmap(new_page);
                 new_page = virt_to_page(tmp);
                 new_offset = virt_to_offset(tmp);
-                mapped_data = ((char *) kmap(new_page)) + new_offset;
-
                 done_copy_out = 1;
         }
 
         /*
-         * Right, time to make up the new buffer_head.
+         * Did we need to do an escaping?  Now we've done all the
+         * copying, we can finally do so.
          */
-        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+        if (do_escape)
+                *((unsigned int *)(mapped_data + new_offset)) = 0;
+        kunmap_atomic(mapped_data, KM_USER0);
 
         /* keep subsequent assertions sane */
         new_bh->b_state = 0;
         init_buffer(new_bh, NULL, NULL);
         atomic_set(&new_bh->b_count, 1);
-        new_jh = journal_add_journal_head(new_bh);
+        jbd_unlock_bh_state(bh_in);
 
-        set_bh_page(new_bh, new_page, new_offset);
+        new_jh = journal_add_journal_head(new_bh);        /* This sleeps */
 
+        set_bh_page(new_bh, new_page, new_offset);
         new_jh->b_transaction = NULL;
         new_bh->b_size = jh2bh(jh_in)->b_size;
         new_bh->b_bdev = transaction->t_journal->j_dev;
@@ -443,15 +457,6 @@ int journal_write_metadata_buffer(transa
         *jh_out = new_jh;
 
         /*
-         * Did we need to do an escaping?  Now we've done all the
-         * copying, we can finally do so.
-         */
-
-        if (do_escape)
-                * ((unsigned int *) mapped_data) = 0;
-        kunmap(new_page);
-
-        /*
          * The to-be-written buffer needs to get moved to the io queue,
          * and the original buffer whose contents we are shadowing or
          * copying is moved to the transaction's shadow queue.
diff -puN fs/jbd/transaction.c~jbd-050-b_frozen_data-locking fs/jbd/transaction.c
--- 25/fs/jbd/transaction.c~jbd-050-b_frozen_data-locking	Thu Jun 5 15:14:18 2003
+++ 25-akpm/fs/jbd/transaction.c	Thu Jun 5 15:14:18 2003
@@ -502,7 +502,6 @@ do_get_write_access(handle_t *handle, st
         int error;
         char *frozen_buffer = NULL;
         int need_copy = 0;
-        int locked;
 
         jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
 
@@ -512,16 +511,9 @@ repeat:
 
         /* @@@ Need to check for errors here at some point. */
 
-        locked = test_set_buffer_locked(bh);
-        if (locked) {
-                /* We can't reliably test the buffer state if we found
-                 * it already locked, so just wait for the lock and
-                 * retry. */
-                unlock_journal(journal);
-                wait_on_buffer(bh);
-                lock_journal(journal);
-                goto repeat;
-        }
+        lock_buffer(bh);
+        jbd_lock_bh_state(bh);
+        spin_lock(&journal_datalist_lock);
 
         /* We now hold the buffer lock so it is safe to query the buffer
          * state.  Is the buffer dirty?
@@ -537,7 +529,6 @@ repeat:
          * the buffer dirtied, ugh.)
          */
         if (buffer_dirty(bh)) {
-                spin_lock(&journal_datalist_lock);
                 /* First question: is this buffer already part of the
                  * current transaction or the existing committing
                  * transaction? */
@@ -552,18 +543,18 @@ repeat:
                         JBUFFER_TRACE(jh, "Unexpected dirty buffer");
                         jbd_unexpected_dirty_buffer(jh);
                 }
-                spin_unlock(&journal_datalist_lock);
         }
 
         unlock_buffer(bh);
 
         error = -EROFS;
-        if (is_handle_aborted(handle))
+        if (is_handle_aborted(handle)) {
+                spin_unlock(&journal_datalist_lock);
+                jbd_unlock_bh_state(bh);
                 goto out_unlocked;
+        }
         error = 0;
 
-        spin_lock(&journal_datalist_lock);
-
         /* The buffer is already part of this transaction if
          * b_transaction or b_next_transaction points to it.
          */
@@ -606,6 +597,7 @@ repeat:
 
                         JBUFFER_TRACE(jh, "on shadow: sleep");
                         spin_unlock(&journal_datalist_lock);
+                        jbd_unlock_bh_state(bh);
                         unlock_journal(journal);
                         /* commit wakes up all shadow buffers after IO */
                         wqh = bh_waitq_head(jh2bh(jh));
@@ -633,16 +625,16 @@ repeat:
                 if (!frozen_buffer) {
                         JBUFFER_TRACE(jh, "allocate memory for buffer");
                         spin_unlock(&journal_datalist_lock);
-                        unlock_journal(journal);
+                        jbd_unlock_bh_state(bh);
                         frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
                                                     GFP_NOFS);
-                        lock_journal(journal);
                         if (!frozen_buffer) {
                                 printk(KERN_EMERG
                                        "%s: OOM for frozen_buffer\n",
                                        __FUNCTION__);
                                 JBUFFER_TRACE(jh, "oom!");
                                 error = -ENOMEM;
+                                jbd_lock_bh_state(bh);
                                 spin_lock(&journal_datalist_lock);
                                 goto done_locked;
                         }
@@ -672,7 +664,6 @@ repeat:
         }
 
 done_locked:
-        spin_unlock(&journal_datalist_lock);
         if (need_copy) {
                 struct page *page;
                 int offset;
@@ -682,17 +673,16 @@ done_locked:
                                "Possible IO failure.\n");
                 page = jh2bh(jh)->b_page;
                 offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
-                source = kmap(page);
+                source = kmap_atomic(page, KM_USER0);
                 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-                kunmap(page);
+                kunmap_atomic(source, KM_USER0);
         }
-
+        spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(bh);
 
         /* If we are about to journal a buffer, then any revoke pending on it
          * is no longer valid. */
-        lock_kernel();
         journal_cancel_revoke(handle, jh);
-        unlock_kernel();
 
 out_unlocked:
         if (frozen_buffer)
@@ -1070,8 +1060,9 @@ int journal_dirty_metadata (handle_t *ha
         if (is_handle_aborted(handle))
                 goto out_unlock;
 
+        jbd_lock_bh_state(bh);
         spin_lock(&journal_datalist_lock);
-        set_bit(BH_JBDDirty, &bh->b_state);
+        set_buffer_jbddirty(bh);
 
         J_ASSERT_JH(jh, jh->b_transaction != NULL);
 
@@ -1102,6 +1093,7 @@ int journal_dirty_metadata (handle_t *ha
 
 done_locked:
         spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(bh);
         JBUFFER_TRACE(jh, "exit");
 out_unlock:
         unlock_journal(journal);
@@ -1168,6 +1160,7 @@ void journal_forget (handle_t *handle, s
         BUFFER_TRACE(bh, "entry");
 
         lock_journal(journal);
+        jbd_lock_bh_state(bh);
         spin_lock(&journal_datalist_lock);
 
         if (!buffer_jbd(bh))
@@ -1181,7 +1174,7 @@ void journal_forget (handle_t *handle, s
                  * of this transaction, then we can just drop it from
                  * the transaction immediately. */
                 clear_buffer_dirty(bh);
-                clear_bit(BH_JBDDirty, &bh->b_state);
+                clear_buffer_jbddirty(bh);
 
                 JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
                 J_ASSERT_JH(jh, !jh->b_committed_data);
@@ -1208,6 +1201,7 @@ void journal_forget (handle_t *handle, s
                         __brelse(bh);
                         if (!buffer_jbd(bh)) {
                                 spin_unlock(&journal_datalist_lock);
+                                jbd_unlock_bh_state(bh);
                                 unlock_journal(journal);
                                 __bforget(bh);
                                 return;
@@ -1231,6 +1225,7 @@ void journal_forget (handle_t *handle, s
 
 not_jbd:
         spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(bh);
         unlock_journal(journal);
         __brelse(bh);
         return;
@@ -1924,7 +1919,7 @@ void __journal_file_buffer(struct journa
         struct buffer_head *bh = jh2bh(jh);
 
         assert_spin_locked(&journal_datalist_lock);
-        
+
 #ifdef __SMP__
         J_ASSERT (current->lock_depth >= 0);
 #endif
@@ -1990,9 +1985,11 @@ void __journal_file_buffer(struct journa
 void journal_file_buffer(struct journal_head *jh,
                         transaction_t *transaction, int jlist)
 {
+        jbd_lock_bh_state(jh2bh(jh));
         spin_lock(&journal_datalist_lock);
         __journal_file_buffer(jh, transaction, jlist);
         spin_unlock(&journal_datalist_lock);
+        jbd_unlock_bh_state(jh2bh(jh));
 }
 
 /*
@@ -2045,12 +2042,13 @@ void __journal_refile_buffer(struct jour
  */
 void journal_refile_buffer(struct journal_head *jh)
 {
-        struct buffer_head *bh;
+        struct buffer_head *bh = jh2bh(jh);
 
+        jbd_lock_bh_state(bh);
         spin_lock(&journal_datalist_lock);
-        bh = jh2bh(jh);
 
         __journal_refile_buffer(jh);
+        jbd_unlock_bh_state(bh);
         journal_remove_journal_head(bh);
 
         spin_unlock(&journal_datalist_lock);
diff -puN fs/jbd/commit.c~jbd-050-b_frozen_data-locking fs/jbd/commit.c
--- 25/fs/jbd/commit.c~jbd-050-b_frozen_data-locking	Thu Jun 5 15:14:18 2003
+++ 25-akpm/fs/jbd/commit.c	Thu Jun 5 15:14:18 2003
@@ -210,8 +210,18 @@ write_out_data_locked:
                         wbuf[bufs++] = bh;
                 } else {
                         BUFFER_TRACE(bh, "writeout complete: unfile");
+                        /*
+                         * We have a lock ranking problem..
+                         */
+                        if (!jbd_trylock_bh_state(bh)) {
+                                spin_unlock(&journal_datalist_lock);
+                                schedule();
+                                spin_lock(&journal_datalist_lock);
+                                break;
+                        }
                         __journal_unfile_buffer(jh);
                         jh->b_transaction = NULL;
+                        jbd_unlock_bh_state(bh);
                         journal_remove_journal_head(bh);
                         __brelse(bh);
                 }
@@ -652,7 +662,6 @@ skip_commit: /* The journal should be un
                         kfree(jh->b_frozen_data);
                         jh->b_frozen_data = NULL;
                 }
-                jbd_unlock_bh_state(jh2bh(jh));
 
                 spin_lock(&journal_datalist_lock);
                 cp_transaction = jh->b_cp_transaction;
@@ -687,11 +696,13 @@ skip_commit: /* The journal should be un
                         __journal_insert_checkpoint(jh, commit_transaction);
                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                         __journal_refile_buffer(jh);
+                        jbd_unlock_bh_state(bh);
                 } else {
                         J_ASSERT_BH(bh, !buffer_dirty(bh));
                         J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                         __journal_unfile_buffer(jh);
                         jh->b_transaction = 0;
+                        jbd_unlock_bh_state(bh);
                         journal_remove_journal_head(bh);
                         __brelse(bh);
                 }
diff -puN fs/jbd/revoke.c~jbd-050-b_frozen_data-locking fs/jbd/revoke.c
_
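A note on the jbd_trylock_bh_state() in the commit.c hunk: the ranking being established is bh-state lock first, then journal_datalist_lock - that is the order in which do_get_write_access() and journal_file_buffer() now acquire them.  The data-writeout loop in commit runs with journal_datalist_lock already held, so it must not spin waiting for the bh-state lock: the holder may itself be spinning on journal_datalist_lock.  Hence the trylock-and-back-off.  Schematically (the function name and retry structure here are illustrative only, not code from the patch):

        static void unfile_buffer_ranked(struct journal_head *jh)
        {
                struct buffer_head *bh = jh2bh(jh);
        retry:
                spin_lock(&journal_datalist_lock);
                if (!jbd_trylock_bh_state(bh)) {
                        /* The bh-state holder may be spinning on
                         * journal_datalist_lock: drop our lock so it
                         * can finish, then start over. */
                        spin_unlock(&journal_datalist_lock);
                        schedule();
                        goto retry;
                }
                /* Both locks held, in the correct order */
                __journal_unfile_buffer(jh);
                jh->b_transaction = NULL;
                jbd_unlock_bh_state(bh);
                spin_unlock(&journal_datalist_lock);
        }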