From: Alex Tomas

We have a race wherein the block allocator can decide that
journal_head.b_committed_data is present and then will use it.  But kjournald
can concurrently free it and set the pointer to NULL.  It goes oops.

We introduce per-buffer_head "spinlocking" based on a bit in b_state.  To do
this we abstract out pte_chain_lock() and reuse the implementation.

The bit-based spinlocking is pretty inefficient CPU-wise (hence the warning
in there) and we may move this to a hashed spinlock later.


 fs/ext3/balloc.c             |   54 +++++++++++++++++++++------------
 fs/jbd/commit.c              |    2 +
 include/linux/jbd.h          |   16 +++++++++
 include/linux/rmap-locking.h |   28 +----------------
 include/linux/spinlock.h     |   70 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 125 insertions(+), 45 deletions(-)
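For readers who want to see the locking rule in isolation before the diff: the
sketch below is not part of the patch.  It is a minimal user-space analogue
(pthreads plus GCC __atomic builtins, with invented names such as toy_bh and
toy_lock_state) of what the patch enforces -- the allocator-side reader of
b_committed_data and the kjournald-side code that frees it must hold the same
per-buffer bit lock, so the test-and-use of the pointer can no longer race
with the kfree()/NULL.

/*
 * Minimal user-space analogue of the b_committed_data locking rule.
 * All names here (toy_bh, TOY_STATE_BIT, toy_lock_state, ...) are invented
 * for illustration; this is not kernel code.  Build with: cc -pthread
 */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_STATE_BIT	0	/* plays the role of BH_State in b_state */

struct toy_bh {
	unsigned long b_state;
	char *b_committed_data;	/* freed and NULLed by the "commit" thread */
};

/* Spin until the state bit is won: analogous to bit_spin_lock(). */
static void toy_lock_state(struct toy_bh *bh)
{
	while (__atomic_fetch_or(&bh->b_state, 1UL << TOY_STATE_BIT,
				 __ATOMIC_ACQUIRE) & (1UL << TOY_STATE_BIT)) {
		while (__atomic_load_n(&bh->b_state, __ATOMIC_RELAXED) &
		       (1UL << TOY_STATE_BIT))
			sched_yield();	/* stand-in for cpu_relax() */
	}
}

/* Clear the state bit: analogous to bit_spin_unlock(). */
static void toy_unlock_state(struct toy_bh *bh)
{
	__atomic_fetch_and(&bh->b_state, ~(1UL << TOY_STATE_BIT),
			   __ATOMIC_RELEASE);
}

/* "Allocator" side: may only look at b_committed_data under the lock. */
static void *reader(void *arg)
{
	struct toy_bh *bh = arg;
	int i;

	for (i = 0; i < 100000; i++) {
		toy_lock_state(bh);
		if (bh->b_committed_data)	/* test and use in one lock hold */
			(void)bh->b_committed_data[0];
		toy_unlock_state(bh);
	}
	return NULL;
}

/* "kjournald" side: frees the committed data, also under the lock. */
static void *freer(void *arg)
{
	struct toy_bh *bh = arg;

	toy_lock_state(bh);
	free(bh->b_committed_data);
	bh->b_committed_data = NULL;
	toy_unlock_state(bh);
	return NULL;
}

int main(void)
{
	struct toy_bh bh = { .b_state = 0, .b_committed_data = calloc(1, 16) };
	pthread_t r, f;

	pthread_create(&r, NULL, reader, &bh);
	pthread_create(&f, NULL, freer, &bh);
	pthread_join(r, NULL);
	pthread_join(f, NULL);
	printf("no oops: committed_data is %p\n", (void *)bh.b_committed_data);
	return 0;
}

Without the toy_lock_state()/toy_unlock_state() calls in reader() and freer()
this is exactly the use-after-free window described above.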
diff -puN fs/ext3/balloc.c~jbd-010-b_committed_data-race-fix fs/ext3/balloc.c
--- 25/fs/ext3/balloc.c~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/fs/ext3/balloc.c	2003-06-07 14:29:32.000000000 -0700
@@ -185,11 +185,14 @@ do_more:
 	if (err)
 		goto error_return;
 
+	jbd_lock_bh_state(bitmap_bh);
+
 	for (i = 0; i < count; i++) {
 		/*
 		 * An HJ special.  This is expensive...
 		 */
 #ifdef CONFIG_JBD_DEBUG
+		jbd_unlock_bh_state(bitmap_bh);
 		{
 			struct buffer_head *debug_bh;
 			debug_bh = sb_find_get_block(sb, block + i);
@@ -202,6 +205,7 @@ do_more:
 				__brelse(debug_bh);
 			}
 		}
+		jbd_lock_bh_state(bitmap_bh);
 #endif
 		/* @@@ This prevents newly-allocated data from being
 		 * freed and then reallocated within the same
@@ -243,6 +247,7 @@ do_more:
 			dquot_freed_blocks++;
 		}
 	}
 
+	jbd_unlock_bh_state(bitmap_bh);
 	spin_lock(sb_bgl_lock(sbi, block_group));
 	gdp->bg_free_blocks_count =
@@ -289,11 +294,12 @@ error_return:
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static inline int ext3_test_allocatable(int nr, struct buffer_head *bh,
+					int have_access)
 {
 	if (ext3_test_bit(nr, bh->b_data))
 		return 0;
-	if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+	if (!have_access || !buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
 		return 1;
 	return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
 }
@@ -305,8 +311,8 @@ static int ext3_test_allocatable(int nr,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int find_next_usable_block(int start,
-			struct buffer_head *bh, int maxblocks)
+static int find_next_usable_block(int start, struct buffer_head *bh,
+			int maxblocks, int have_access)
 {
 	int here, next;
 	char *p, *r;
@@ -322,7 +328,8 @@ static int find_next_usable_block(int st
 		 */
 		int end_goal = (start + 63) & ~63;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext3_test_allocatable(here, bh))
+		if (here < end_goal &&
+		    ext3_test_allocatable(here, bh, have_access))
 			return here;
 		ext3_debug ("Bit not found near goal\n");
 
@@ -345,7 +352,7 @@ static int find_next_usable_block(int st
 	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
 	next = (r - ((char *) bh->b_data)) << 3;
 
-	if (next < maxblocks && ext3_test_allocatable(next, bh))
+	if (next < maxblocks && ext3_test_allocatable(next, bh, have_access))
 		return next;
 
 	/* The bitmap search --- search forward alternately
@@ -357,13 +364,13 @@ static int find_next_usable_block(int st
 						maxblocks, here);
 		if (next >= maxblocks)
 			return -1;
-		if (ext3_test_allocatable(next, bh))
+		if (ext3_test_allocatable(next, bh, have_access))
 			return next;
 
-		J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);
-		here = ext3_find_next_zero_bit
-			((unsigned long *) bh2jh(bh)->b_committed_data,
-			maxblocks, next);
+		if (have_access)
+			here = ext3_find_next_zero_bit
+				((unsigned long *) bh2jh(bh)->b_committed_data,
+				maxblocks, next);
 	}
 	return -1;
 }
@@ -402,17 +409,18 @@ ext3_try_to_allocate(struct super_block
 
 	*errp = 0;
 
-	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh, 0))
 		goto got;
 
 repeat:
 	goal = find_next_usable_block(goal, bitmap_bh,
-				EXT3_BLOCKS_PER_GROUP(sb));
+				EXT3_BLOCKS_PER_GROUP(sb), have_access);
 	if (goal < 0)
 		goto fail;
 
 	for (i = 0;
-	     i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+	     i < 7 && goal > 0 &&
+	     ext3_test_allocatable(goal - 1, bitmap_bh, have_access);
 	     i++, goal--);
 
 got:
@@ -429,6 +437,7 @@ got:
 			*errp = fatal;
 			goto fail;
 		}
+		jbd_lock_bh_state(bitmap_bh);
 		have_access = 1;
 	}
 
@@ -444,6 +453,7 @@ got:
 	}
 
 	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+	jbd_unlock_bh_state(bitmap_bh);
 	fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (fatal) {
 		*errp = fatal;
@@ -454,6 +464,7 @@ got:
 fail:
 	if (have_access) {
 		BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+		jbd_unlock_bh_state(bitmap_bh);
 		ext3_journal_release_buffer(handle, bitmap_bh);
 	}
 	return -1;
@@ -611,14 +622,19 @@ allocated:
 			brelse(debug_bh);
 		}
 	}
-#endif
+	jbd_lock_bh_state(bitmap_bh);
 	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
-		J_ASSERT_BH(bitmap_bh,
-			!ext3_test_bit(ret_block,
-					bh2jh(bitmap_bh)->b_committed_data));
+	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+		if (ext3_test_bit(ret_block,
+				bh2jh(bitmap_bh)->b_committed_data)) {
+			printk("%s: block was unexpectedly set in "
+				"b_committed_data\n", __FUNCTION__);
+		}
+	}
 	ext3_debug("found bit %d\n", ret_block);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
+	jbd_unlock_bh_state(bitmap_bh);
+#endif
 
 	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
 	ret_block = target_block;
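(Aside, not part of the patch: the decision rule that the reworked
ext3_test_allocatable() implements can be restated as a small stand-alone
model, using invented toy_bitmap/test_bit_in() helpers in place of
buffer_head/journal_head.  The point of the extra have_access argument is that
b_committed_data may only be consulted once the caller has journal-accessed
the bitmap buffer and holds the bh state lock; without that, the block is
optimistically treated as allocatable and re-checked later.)

/*
 * Toy model of the ext3_test_allocatable() rule after this patch.
 * toy_bitmap and test_bit_in() are invented for illustration only.
 */
#include <stdio.h>

struct toy_bitmap {
	const unsigned char *b_data;		/* live on-disk bitmap */
	const unsigned char *b_committed_data;	/* last committed copy, may be NULL */
};

static int test_bit_in(int nr, const unsigned char *map)
{
	return (map[nr >> 3] >> (nr & 7)) & 1;
}

/*
 * have_access models "we hold the journal access and the bh state lock",
 * which is the only situation in which b_committed_data may be dereferenced.
 */
static int toy_test_allocatable(int nr, const struct toy_bitmap *bm, int have_access)
{
	if (test_bit_in(nr, bm->b_data))
		return 0;		/* busy in the live bitmap */
	if (!have_access || !bm->b_committed_data)
		return 1;		/* cannot (or need not) check committed copy */
	return !test_bit_in(nr, bm->b_committed_data);
}

int main(void)
{
	unsigned char live[1] = { 0x00 };	/* all blocks free on disk */
	unsigned char committed[1] = { 0x02 };	/* block 1 still busy in committed copy */
	struct toy_bitmap bm = { live, committed };

	printf("block 0, with access:    %d\n", toy_test_allocatable(0, &bm, 1)); /* 1 */
	printf("block 1, with access:    %d\n", toy_test_allocatable(1, &bm, 1)); /* 0 */
	printf("block 1, without access: %d\n", toy_test_allocatable(1, &bm, 0)); /* 1 */
	return 0;
}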
diff -puN fs/jbd/commit.c~jbd-010-b_committed_data-race-fix fs/jbd/commit.c
--- 25/fs/jbd/commit.c~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/fs/jbd/commit.c	2003-06-07 14:29:32.000000000 -0700
@@ -640,6 +640,7 @@ skip_commit: /* The journal should be un
 		 *
 		 * Otherwise, we can just throw away the frozen data now.
 		 */
+		jbd_lock_bh_state(jh2bh(jh));
 		if (jh->b_committed_data) {
 			kfree(jh->b_committed_data);
 			jh->b_committed_data = NULL;
@@ -651,6 +652,7 @@ skip_commit: /* The journal should be un
 			kfree(jh->b_frozen_data);
 			jh->b_frozen_data = NULL;
 		}
+		jbd_unlock_bh_state(jh2bh(jh));
 
 		spin_lock(&journal_datalist_lock);
 		cp_transaction = jh->b_cp_transaction;
diff -puN include/linux/jbd.h~jbd-010-b_committed_data-race-fix include/linux/jbd.h
--- 25/include/linux/jbd.h~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/include/linux/jbd.h	2003-06-07 14:29:34.000000000 -0700
@@ -292,6 +292,7 @@ enum jbd_state_bits {
 	BH_Revoked,		/* Has been revoked from the log */
 	BH_RevokeValid,		/* Revoked flag is valid */
 	BH_JBDDirty,		/* Is dirty but journaled */
+	BH_State,		/* Pins most journal_head state */
 };
 
 BUFFER_FNS(JBD, jbd)
@@ -309,6 +310,21 @@ static inline struct journal_head *bh2jh
 	return bh->b_private;
 }
 
+static inline void jbd_lock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_lock(BH_State, &bh->b_state);
+}
+
+static inline int jbd_trylock_bh_state(struct buffer_head *bh)
+{
+	return bit_spin_trylock(BH_State, &bh->b_state);
+}
+
+static inline void jbd_unlock_bh_state(struct buffer_head *bh)
+{
+	bit_spin_unlock(BH_State, &bh->b_state);
+}
+
 #define HAVE_JOURNAL_CALLBACK_STATUS
 /**
  * struct journal_callback - Base structure for callback information.
diff -puN include/linux/rmap-locking.h~jbd-010-b_committed_data-race-fix include/linux/rmap-locking.h
--- 25/include/linux/rmap-locking.h~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/include/linux/rmap-locking.h	2003-06-07 14:29:32.000000000 -0700
@@ -10,32 +10,8 @@ struct pte_chain;
 
 extern kmem_cache_t *pte_chain_cache;
 
-static inline void pte_chain_lock(struct page *page)
-{
-	/*
-	 * Assuming the lock is uncontended, this never enters
-	 * the body of the outer loop. If it is contended, then
-	 * within the inner loop a non-atomic test is used to
-	 * busywait with less bus contention for a good time to
-	 * attempt to acquire the lock bit.
-	 */
-	preempt_disable();
-#ifdef CONFIG_SMP
-	while (test_and_set_bit(PG_chainlock, &page->flags)) {
-		while (test_bit(PG_chainlock, &page->flags))
-			cpu_relax();
-	}
-#endif
-}
-
-static inline void pte_chain_unlock(struct page *page)
-{
-#ifdef CONFIG_SMP
-	smp_mb__before_clear_bit();
-	clear_bit(PG_chainlock, &page->flags);
-#endif
-	preempt_enable();
-}
+#define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, &page->flags)
+#define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, &page->flags)
 
 struct pte_chain *pte_chain_alloc(int gfp_flags);
 void __pte_chain_free(struct pte_chain *pte_chain);
diff -puN include/linux/spinlock.h~jbd-010-b_committed_data-race-fix include/linux/spinlock.h
--- 25/include/linux/spinlock.h~jbd-010-b_committed_data-race-fix	2003-06-07 14:29:32.000000000 -0700
+++ 25-akpm/include/linux/spinlock.h	2003-06-07 14:29:32.000000000 -0700
@@ -541,4 +541,74 @@ do { \
 extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
 #endif
 
+/*
+ * bit-based spin_lock()
+ *
+ * Don't use this unless you really need to: spin_lock() and spin_unlock()
+ * are significantly faster.
+ */
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+	preempt_disable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	while (test_and_set_bit(bitnum, addr)) {
+		while (test_bit(bitnum, addr))
+			cpu_relax();
+	}
+#endif
+}
+
+/*
+ * Return true if it was acquired
+ */
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	int ret;
+
+	preempt_disable();
+	ret = !test_and_set_bit(bitnum, addr);
+	if (!ret)
+		preempt_enable();
+	return ret;
+#else
+	preempt_disable();
+	return 1;
+#endif
+}
+
+/*
+ * bit-based spin_unlock()
+ */
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	BUG_ON(!test_bit(bitnum, addr));
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	preempt_enable();
+}
+
+/*
+ * Return true if the lock is held.
+ */
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	return test_bit(bitnum, addr);
+#elif defined CONFIG_PREEMPT
+	return preempt_count();
+#else
+	return 1;
+#endif
+}
+
 #endif /* __LINUX_SPINLOCK_H */
_
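(Aside, not part of the patch: the intended semantics of the new
bit_spin_trylock()/bit_spin_unlock() pair -- trylock fails while the bit is
held and succeeds again after release -- can be sanity-checked with a
user-space approximation built on GCC __atomic builtins.  It deliberately
omits the preempt_disable()/preempt_enable() handling of the kernel version.)

/*
 * User-space approximation of the bit_spin_* semantics (no preemption
 * handling, GCC atomic builtins instead of the kernel bitops).
 * Illustration only; bit_try()/bit_unlock() are invented names.
 */
#include <assert.h>
#include <stdio.h>

static int bit_try(int bitnum, unsigned long *addr)
{
	unsigned long mask = 1UL << bitnum;

	/* Acquired only if the bit was previously clear. */
	return !(__atomic_fetch_or(addr, mask, __ATOMIC_ACQUIRE) & mask);
}

static void bit_unlock(int bitnum, unsigned long *addr)
{
	__atomic_fetch_and(addr, ~(1UL << bitnum), __ATOMIC_RELEASE);
}

int main(void)
{
	unsigned long state = 0;	/* plays the role of bh->b_state */
	int bit = 3;			/* plays the role of BH_State */

	assert(bit_try(bit, &state));	/* uncontended: acquired */
	assert(!bit_try(bit, &state));	/* already held: trylock fails */
	bit_unlock(bit, &state);
	assert(bit_try(bit, &state));	/* released: acquired again */
	bit_unlock(bit, &state);
	printf("bit lock semantics OK, state=%lx\n", state);
	return 0;
}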