From: Alex Tomas

This patch weans ext3 off lock_super()-based protection for the inode and
block allocators.  It's basically the same as the ext2 changes.

1) each group has its own spinlock, which is used for group counter
   modifications

2) sb->s_free_blocks_count isn't used any more.  ext3_statfs() and
   find_group_orlov() loop over the groups to count free blocks

3) sb->s_free_blocks_count is recalculated at mount/umount/sync_super time
   in order to check consistency and to avoid fsck warnings

4) reserved blocks are distributed over the last groups

5) ext3_new_block() tries to use non-reserved blocks and, if that fails,
   then tries the reserved blocks

6) ext3_new_block() and ext3_free_blocks() do not modify
   sb->s_free_blocks_count, and therefore do not call mark_buffer_dirty()
   on the superblock's buffer_head.  This should reduce I/O a bit.


DESC
Fix orlov allocator boundary case
EDESC

In the interests of SMP scalability the ext3 free blocks and free inodes
counters are "approximate".  But there is a piece of code in the Orlov
allocator which fails due to boundary conditions on really small
filesystems.

Fix that up via a final allocation pass which simply uses first-fit for
allocation of a directory inode.


 25-akpm/fs/ext3/balloc.c           |  298 +++++++++++++++++++++----------------
 25-akpm/fs/ext3/ialloc.c           |   79 ++++-----
 25-akpm/fs/ext3/super.c            |   59 ++++++-
 25-akpm/fs/jbd/journal.c           |    2 
 25-akpm/fs/jbd/transaction.c       |    2 
 25-akpm/include/linux/ext3_fs.h    |    2 
 25-akpm/include/linux/ext3_fs_sb.h |   10 -
 25-akpm/include/linux/ext3_jbd.h   |    6 
 8 files changed, 280 insertions(+), 178 deletions(-)
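To see the shape of items 1)-3) outside the diff context, here is a minimal
stand-alone sketch of the counter scheme.  It is hedged, illustrative
user-space code, not the kernel implementation: struct bg_info,
group_sub_free() and count_free_blocks() are names invented for the example.

#include <pthread.h>

#define NGROUPS 16

/* Models ext3_bg_info: one lock and one counter per block group. */
struct bg_info {
        pthread_spinlock_t lock;
        unsigned long free_blocks;
};

static struct bg_info groups[NGROUPS];

static void groups_init(void)
{
        int i;

        for (i = 0; i < NGROUPS; i++) {
                pthread_spin_init(&groups[i].lock, PTHREAD_PROCESS_PRIVATE);
                groups[i].free_blocks = 8192;   /* invented count */
        }
}

/* Allocation path: only the chosen group's lock is ever taken. */
static void group_sub_free(int group, unsigned long n)
{
        pthread_spin_lock(&groups[group].lock);
        groups[group].free_blocks -= n;
        pthread_spin_unlock(&groups[group].lock);
}

/*
 * statfs-style path: no global counter is maintained; just walk the
 * groups and sum.  The result may be slightly stale while allocators
 * run concurrently - that is the "approximate counters" trade-off.
 */
static unsigned long count_free_blocks(void)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < NGROUPS; i++)
                total += groups[i].free_blocks;
        return total;
}

Two allocators working in different groups never contend for the same lock,
which is where the scalability win comes from; the superblock's global
fields then only need to be refreshed at mount/umount/sync time.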
diff -puN fs/ext3/balloc.c~ext3-concurrent-block-inode-allocation fs/ext3/balloc.c
--- 25/fs/ext3/balloc.c~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/fs/ext3/balloc.c  Thu Jun 5 15:14:11 2003
@@ -118,7 +118,6 @@ void ext3_free_blocks (handle_t *handle,
 		printk ("ext3_free_blocks: nonexistent device");
 		return;
 	}
-	lock_super (sb);
 	es = EXT3_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
@@ -184,11 +183,6 @@ do_more:
 	if (err)
 		goto error_return;
 
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-	if (err)
-		goto error_return;
-
 	for (i = 0; i < count; i++) {
 		/*
 		 * An HJ special.  This is expensive...
@@ -207,19 +201,6 @@ do_more:
 		}
 	}
 #endif
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
-			ext3_error (sb, __FUNCTION__,
-				    "bit already cleared for block %lu",
-				    block + i);
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			dquot_freed_blocks++;
-			gdp->bg_free_blocks_count =
-				cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
-			es->s_free_blocks_count =
-				cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
-		}
 		/* @@@ This prevents newly-allocated data from being
 		 * freed and then reallocated within the same
 		 * transaction.
@@ -238,12 +219,35 @@ do_more:
 		 * activity on the buffer any more and so it is safe to
 		 * reallocate it.
 		 */
-		BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
+		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
 		J_ASSERT_BH(bitmap_bh,
 			bh2jh(bitmap_bh)->b_committed_data != NULL);
-		ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
+		ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
+				bh2jh(bitmap_bh)->b_committed_data);
+
+		/*
+		 * We clear the bit in the bitmap after setting the committed
+		 * data bit, because this is the reverse order to that which
+		 * the allocator uses.
+		 */
+		BUFFER_TRACE(bitmap_bh, "clear bit");
+		if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+					bit + i, bitmap_bh->b_data)) {
+			ext3_error (sb, __FUNCTION__,
+				"bit already cleared for block %lu",
+				block + i);
+			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+		} else {
+			dquot_freed_blocks++;
+		}
 	}
+	spin_lock(bg_lock(sb, block_group));
+	gdp->bg_free_blocks_count =
+		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) +
+			dquot_freed_blocks);
+	spin_unlock(bg_lock(sb, block_group));
+
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
@@ -253,11 +257,6 @@ do_more:
 	ret = ext3_journal_dirty_metadata(handle, gd_bh);
 	if (!err) err = ret;
 
-	/* And the superblock */
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock");
-	ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	if (!err) err = ret;
-
 	if (overflow && !err) {
 		block += count;
 		count = overflow;
@@ -267,7 +266,6 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, err);
-	unlock_super(sb);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
@@ -368,6 +366,98 @@ static int find_next_usable_block(int st
 }
 
 /*
+ * We think we can allocate this block in this bitmap.  Try to set the bit.
+ * If that succeeds then check that nobody has allocated and then freed the
+ * block since we saw that it was not marked in b_committed_data.  If it _was_
+ * allocated and freed then clear the bit in the bitmap again and return
+ * zero (failure).
+ */
+static inline int
+claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+{
+	if (ext3_set_bit_atomic(lock, block, bh->b_data))
+		return 0;
+	if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data &&
+			ext3_test_bit(block, bh2jh(bh)->b_committed_data)) {
+		ext3_clear_bit_atomic(lock, block, bh->b_data);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * If we failed to allocate the desired block then we may end up crossing to a
+ * new bitmap.  In that case we must release write access to the old one via
+ * ext3_journal_release_buffer(), else we'll run out of credits.
+ */
+static int
+ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+		struct buffer_head *bitmap_bh, int goal, int *errp)
+{
+	int i, fatal = 0;
+	int have_access = 0;
+
+	*errp = 0;
+
+	if (goal >= 0 && ext3_test_allocatable(goal, bitmap_bh))
+		goto got;
+
+repeat:
+	goal = find_next_usable_block(goal, bitmap_bh,
+				EXT3_BLOCKS_PER_GROUP(sb));
+	if (goal < 0)
+		goto fail;
+
+	for (i = 0;
+		i < 7 && goal > 0 && ext3_test_allocatable(goal - 1, bitmap_bh);
+		i++, goal--);
+
+got:
+	if (!have_access) {
+		/*
+		 * Make sure we use undo access for the bitmap, because it is
+		 * critical that we do the frozen_data COW on bitmap buffers in
+		 * all cases even if the buffer is in BJ_Forget state in the
+		 * committing transaction.
+		 */
+		BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+		fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
+		if (fatal) {
+			*errp = fatal;
+			goto fail;
+		}
+		have_access = 1;
+	}
+
+	if (!claim_block(bg_lock(sb, group), goal, bitmap_bh)) {
+		/*
+		 * The block was allocated by another thread, or it was
+		 * allocated and then freed by another thread
+		 */
+		goal++;
+		if (goal >= EXT3_BLOCKS_PER_GROUP(sb))
+			goto fail;
+		goto repeat;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
+	fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
+	if (fatal) {
+		*errp = fatal;
+		goto fail;
+	}
+
+	return goal;
+fail:
+	if (have_access) {
+		BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+		ext3_journal_release_buffer(handle, bitmap_bh);
+	}
+	return -1;
+}
+
+
+/*
  * ext3_new_block uses a goal block to assist allocation.  If the goal is
  * free, or there is a free block within 32 blocks of the goal, that block
  * is allocated.  Otherwise a forward search is made for a free block; within
@@ -383,10 +473,12 @@ ext3_new_block(handle_t *handle, struct 
 	struct buffer_head *gdp_bh;	/* bh2 */
 	int group_no;			/* i */
 	int ret_block;			/* j */
-	int bit;			/* k */
+	int bgi;			/* blockgroup iteration index */
 	int target_block;		/* tmp */
 	int fatal = 0, err;
 	int performed_allocation = 0;
+	int free;
+	int use_reserve = 0;
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -408,16 +500,7 @@ ext3_new_block(handle_t *handle, struct 
 		return 0;
 	}
 
-	lock_super(sb);
 	es = EXT3_SB(sb)->s_es;
-	if (le32_to_cpu(es->s_free_blocks_count) <=
-			le32_to_cpu(es->s_r_blocks_count) &&
-	    ((EXT3_SB(sb)->s_resuid != current->fsuid) &&
-	     (EXT3_SB(sb)->s_resgid == 0 ||
-	      !in_group_p(EXT3_SB(sb)->s_resgid)) &&
-	     !capable(CAP_SYS_RESOURCE)))
-		goto out;
-
 	ext3_debug("goal=%lu.\n", goal);
 
 	/*
@@ -432,40 +515,28 @@ ext3_new_block(handle_t *handle, struct 
 	if (!gdp)
 		goto io_error;
 
-	if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+	free = le16_to_cpu(gdp->bg_free_blocks_count);
+	free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
+	if (free > 0) {
 		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
-#ifdef EXT3FS_DEBUG
-		if (ret_block)
-			goal_attempts++;
-#endif
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
-			goto io_error;
-
-		ext3_debug("goal is at %d:%d.\n", group_no, ret_block);
-
-		if (ext3_test_allocatable(ret_block, bitmap_bh)) {
-#ifdef EXT3FS_DEBUG
-			goal_hits++;
-			ext3_debug("goal bit allocated.\n");
-#endif
-			goto got_block;
-		}
-
-		ret_block = find_next_usable_block(ret_block, bitmap_bh,
-					EXT3_BLOCKS_PER_GROUP(sb));
+			goto io_error;
+		ret_block = ext3_try_to_allocate(sb, handle, group_no,
+					bitmap_bh, ret_block, &fatal);
+		if (fatal)
+			goto out;
 		if (ret_block >= 0)
-			goto search_back;
+			goto allocated;
 	}
-
-	ext3_debug("Bit not found in block group %d.\n", group_no);
-
+
 	/*
 	 * Now search the rest of the groups.  We assume that
 	 * i and gdp correctly point to the last group visited.
 	 */
-	for (bit = 0; bit < EXT3_SB(sb)->s_groups_count; bit++) {
+repeat:
+	for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) {
 		group_no++;
 		if (group_no >= EXT3_SB(sb)->s_groups_count)
 			group_no = 0;
@@ -474,57 +545,47 @@ ext3_new_block(handle_t *handle, struct 
 			*errp = -EIO;
 			goto out;
 		}
-		if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
-			brelse(bitmap_bh);
-			bitmap_bh = read_block_bitmap(sb, group_no);
-			if (!bitmap_bh)
-				goto io_error;
-			ret_block = find_next_usable_block(-1, bitmap_bh,
-					EXT3_BLOCKS_PER_GROUP(sb));
-			if (ret_block >= 0)
-				goto search_back;
-		}
+		free = le16_to_cpu(gdp->bg_free_blocks_count);
+		if (!use_reserve)
+			free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
+		if (free <= 0)
+			continue;
+
+		brelse(bitmap_bh);
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		ret_block = ext3_try_to_allocate(sb, handle, group_no,
+					bitmap_bh, -1, &fatal);
+		if (fatal)
+			goto out;
+		if (ret_block >= 0)
+			goto allocated;
+	}
+
+	if (!use_reserve &&
+	    (EXT3_SB(sb)->s_resuid == current->fsuid ||
+	     (EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) ||
+	     capable(CAP_SYS_RESOURCE))) {
+		use_reserve = 1;
+		group_no = 0;
+		goto repeat;
 	}
 
 	/* No space left on the device */
+	*errp = -ENOSPC;
 	goto out;
 
-search_back:
-	/*
-	 * We have succeeded in finding a free byte in the block
-	 * bitmap.  Now search backwards up to 7 bits to find the
-	 * start of this group of free blocks.
-	 */
-	for (	bit = 0;
-		bit < 7 && ret_block > 0 &&
-		ext3_test_allocatable(ret_block - 1, bitmap_bh);
-		bit++, ret_block--)
-		;
-
-got_block:
+allocated:
 
 	ext3_debug("using block group %d(%d)\n", group_no,
 			gdp->bg_free_blocks_count);
 
-	/* Make sure we use undo access for the bitmap, because it is
-	   critical that we do the frozen_data COW on bitmap buffers in
-	   all cases even if the buffer is in BJ_Forget state in the
-	   committing transaction.  */
-	BUFFER_TRACE(bitmap_bh, "get undo access for marking new block");
-	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal)
-		goto out;
-
 	BUFFER_TRACE(gdp_bh, "get_write_access");
 	fatal = ext3_journal_get_write_access(handle, gdp_bh);
 	if (fatal)
 		goto out;
 
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
-	fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-	if (fatal)
-		goto out;
-
 	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
 				+ le32_to_cpu(es->s_first_data_block);
@@ -536,11 +597,6 @@ got_block:
 			  "Allocating block in system zone - "
 			  "block = %u", target_block);
 
-	/* The superblock lock should guard against anybody else beating
-	 * us to this point! */
-	J_ASSERT_BH(bitmap_bh, !ext3_test_bit(ret_block, bitmap_bh->b_data));
-	BUFFER_TRACE(bitmap_bh, "setting bitmap bit");
-	ext3_set_bit(ret_block, bitmap_bh->b_data);
 	performed_allocation = 1;
 
 #ifdef CONFIG_JBD_DEBUG
@@ -556,20 +612,17 @@ got_block:
 		}
 	}
 #endif
+	spin_lock(bg_lock(sb, group_no));
 	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
 		J_ASSERT_BH(bitmap_bh,
 			!ext3_test_bit(ret_block,
 					bh2jh(bitmap_bh)->b_committed_data));
 	ext3_debug("found bit %d\n", ret_block);
+	spin_unlock(bg_lock(sb, group_no));
 
 	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
 	ret_block = target_block;
 
-	BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for bitmap block");
-	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
-	if (!fatal)
-		fatal = err;
-
 	if (ret_block >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
 			"block(%d) >= blocks count(%d) - "
@@ -586,27 +639,20 @@ got_block:
 	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
+	spin_lock(bg_lock(sb, group_no));
 	gdp->bg_free_blocks_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
-	es->s_free_blocks_count =
-		cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
+	spin_unlock(bg_lock(sb, group_no));
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
 	if (!fatal)
 		fatal = err;
 
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
-			"journal_dirty_metadata for superblock");
-	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	if (!fatal)
-		fatal = err;
-
 	sb->s_dirt = 1;
 	if (fatal)
 		goto out;
 
-	unlock_super(sb);
 	*errp = 0;
 	brelse(bitmap_bh);
 	return ret_block;
@@ -618,7 +664,6 @@ out:
 		*errp = fatal;
 		ext3_std_error(sb, fatal);
 	}
-	unlock_super(sb);
 	/*
 	 * Undo the block allocation
 	 */
@@ -631,12 +676,13 @@ out:
 
 unsigned long ext3_count_free_blocks(struct super_block *sb)
 {
+	unsigned long desc_count;
+	struct ext3_group_desc *gdp;
+	int i;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long desc_count, bitmap_count, x;
+	unsigned long bitmap_count, x;
 	struct buffer_head *bitmap_bh = NULL;
-	struct ext3_group_desc *gdp;
-	int i;
 
 	lock_super(sb);
 	es = EXT3_SB(sb)->s_es;
@@ -664,7 +710,15 @@ unsigned long ext3_count_free_blocks(str
 	unlock_super(sb);
 	return bitmap_count;
 #else
-	return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count);
+	desc_count = 0;
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+		gdp = ext3_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+	}
+
+	return desc_count;
 #endif
 }
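The race handling that claim_block() and the reordered ext3_free_blocks()
implement above can be condensed into a sketch.  Again hedged and
illustrative: a single 64-bit word stands in for each bitmap, and
set_bit_atomic()/clear_bit_atomic() are invented stand-ins for the
ext3_*_bit_atomic() primitives, serialized by one group lock.

#include <pthread.h>

static pthread_spinlock_t bitmap_lock;  /* models sb_bgl_lock() */
static unsigned long live_bitmap;       /* models bitmap_bh->b_data */
static unsigned long committed_bitmap;  /* models b_committed_data */

static void bitmap_lock_init(void)
{
        pthread_spin_init(&bitmap_lock, PTHREAD_PROCESS_PRIVATE);
}

/* Test-and-set under the group lock; returns nonzero if already set. */
static int set_bit_atomic(unsigned long *map, int nr)
{
        unsigned long mask = 1UL << nr;
        int was_set;

        pthread_spin_lock(&bitmap_lock);
        was_set = (*map & mask) != 0;
        *map |= mask;
        pthread_spin_unlock(&bitmap_lock);
        return was_set;
}

static void clear_bit_atomic(unsigned long *map, int nr)
{
        pthread_spin_lock(&bitmap_lock);
        *map &= ~(1UL << nr);
        pthread_spin_unlock(&bitmap_lock);
}

/* Allocator: claim in the live bitmap FIRST, then check committed data. */
static int claim(int nr)
{
        if (set_bit_atomic(&live_bitmap, nr))
                return 0;       /* raced with another allocator */
        /* unlocked read, mirroring claim_block()'s ext3_test_bit() */
        if (committed_bitmap & (1UL << nr)) {
                /* freed within the committing transaction; cannot reuse */
                clear_bit_atomic(&live_bitmap, nr);
                return 0;
        }
        return 1;
}

/* Freeing path: the REVERSE order - committed data first, live bit last. */
static void release(int nr)
{
        set_bit_atomic(&committed_bitmap, nr);
        clear_bit_atomic(&live_bitmap, nr);
}

Because the freeing side touches the two bitmaps in the reverse of the
allocator's order, every interleaving ends with the allocator either losing
the test-and-set outright or seeing the committed-data bit and backing off,
so a block freed inside the committing transaction is never handed out
before that transaction commits.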
diff -puN fs/ext3/ialloc.c~ext3-concurrent-block-inode-allocation fs/ext3/ialloc.c
--- 25/fs/ext3/ialloc.c~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/fs/ext3/ialloc.c  Thu Jun 5 15:14:11 2003
@@ -131,7 +131,6 @@ void ext3_free_inode (handle_t *handle, 
 	/* Do this BEFORE marking the inode not in use or returning an error */
 	clear_inode (inode);
 
-	lock_super (sb);
 	es = EXT3_SB(sb)->s_es;
 	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
 		ext3_error (sb, "ext3_free_inode",
@@ -150,7 +149,8 @@ void ext3_free_inode (handle_t *handle, 
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	if (!ext3_clear_bit(bit, bitmap_bh->b_data))
+	if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+					bit, bitmap_bh->b_data))
 		ext3_error (sb, "ext3_free_inode",
 			   "bit already cleared for inode %lu", ino);
 	else {
@@ -160,28 +160,18 @@ void ext3_free_inode (handle_t *handle, 
 		fatal = ext3_journal_get_write_access(handle, bh2);
 		if (fatal) goto error_return;
 
-		BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access");
-		fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-		if (fatal) goto error_return;
-
 		if (gdp) {
+			spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
 			gdp->bg_free_inodes_count = cpu_to_le16(
 				le16_to_cpu(gdp->bg_free_inodes_count) + 1);
-			if (is_directory) {
+			if (is_directory)
 				gdp->bg_used_dirs_count = cpu_to_le16(
 					le16_to_cpu(gdp->bg_used_dirs_count) - 1);
-				EXT3_SB(sb)->s_dir_count--;
-			}
+			spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
 		}
 		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
 		err = ext3_journal_dirty_metadata(handle, bh2);
 		if (!fatal) fatal = err;
-		es->s_free_inodes_count =
-			cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
-		BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
-			"call ext3_journal_dirty_metadata");
-		err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-		if (!fatal) fatal = err;
 	}
 	BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
@@ -191,7 +181,6 @@ void ext3_free_inode (handle_t *handle, 
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, fatal);
-	unlock_super(sb);
 }
 
 /*
@@ -206,9 +195,8 @@ error_return:
  */
 static int find_group_dir(struct super_block *sb, struct inode *parent)
 {
-	struct ext3_super_block * es = EXT3_SB(sb)->s_es;
 	int ngroups = EXT3_SB(sb)->s_groups_count;
-	int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
+	int avefreei = ext3_count_free_inodes(sb) / ngroups;
 	struct ext3_group_desc *desc, *best_desc = NULL;
 	struct buffer_head *bh;
 	int group, best_group = -1;
@@ -264,10 +252,12 @@ static int find_group_orlov(struct super
 	struct ext3_super_block *es = sbi->s_es;
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
-	int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
-	int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+	int freei = ext3_count_free_inodes(sb);
+	int avefreei = freei / ngroups;
+	int freeb = ext3_count_free_blocks(sb);
+	int avefreeb = freeb / ngroups;
 	int blocks_per_dir;
-	int ndirs = sbi->s_dir_count;
+	int ndirs = ext3_count_dirs(sb);
 	int max_debt, max_dirs, min_blocks, min_inodes;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
@@ -319,7 +309,7 @@ static int find_group_orlov(struct super
 		desc = ext3_get_group_desc (sb, group, &bh);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
-		if (sbi->s_debts[group] >= max_debt)
+		if (sbi->s_bgi[group].bg_debts >= max_debt)
 			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
 			continue;
@@ -340,6 +330,15 @@ fallback:
 		return group;
 	}
 
+	if (avefreei) {
+		/*
+		 * The free-inodes counter is approximate, and for really small
+		 * filesystems the above test can fail to find any blockgroups
+		 */
+		avefreei = 0;
+		goto fallback;
+	}
+
 	return -1;
 }
 
@@ -435,7 +434,6 @@ struct inode *ext3_new_inode(handle_t *h
 		return ERR_PTR(-ENOMEM);
 	ei = EXT3_I(inode);
 
-	lock_super (sb);
 	es = EXT3_SB(sb)->s_es;
 repeat:
 	if (S_ISDIR(mode)) {
@@ -464,11 +462,9 @@ repeat:
 	err = ext3_journal_get_write_access(handle, bitmap_bh);
 	if (err) goto fail;
 
-	if (ext3_set_bit(ino, bitmap_bh->b_data)) {
-		ext3_error (sb, "ext3_new_inode",
-			      "bit already set for inode %lu", ino);
+	if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
+				ino, bitmap_bh->b_data))
 		goto repeat;
-	}
 	BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (err) goto fail;
@@ -504,26 +500,19 @@ repeat:
 	BUFFER_TRACE(bh2, "get_write_access");
 	err = ext3_journal_get_write_access(handle, bh2);
 	if (err) goto fail;
+	spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
 	gdp->bg_free_inodes_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
 	if (S_ISDIR(mode)) {
 		gdp->bg_used_dirs_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
-		EXT3_SB(sb)->s_dir_count++;
 	}
+	spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
 	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
 	err = ext3_journal_dirty_metadata(handle, bh2);
 	if (err) goto fail;
 
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-	if (err) goto fail;
-	es->s_free_inodes_count =
-		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
-	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
-	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
-	if (err) goto fail;
 
 	inode->i_uid = current->fsuid;
 	if (test_opt (sb, GRPID))
@@ -576,7 +565,6 @@ repeat:
 
 	ei->i_state = EXT3_STATE_NEW;
 
-	unlock_super(sb);
 	ret = inode;
 	if(DQUOT_ALLOC_INODE(inode)) {
 		DQUOT_DROP(inode);
@@ -600,7 +588,6 @@ repeat:
 fail:
 	ext3_std_error(sb, err);
 out:
-	unlock_super(sb);
 	iput(inode);
 	ret = ERR_PTR(err);
 really_out:
@@ -673,12 +660,13 @@ out:
 
 unsigned long ext3_count_free_inodes (struct super_block * sb)
 {
+	unsigned long desc_count;
+	struct ext3_group_desc *gdp;
+	int i;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long desc_count, bitmap_count, x;
-	struct ext3_group_desc *gdp;
+	unsigned long bitmap_count, x;
 	struct buffer_head *bitmap_bh = NULL;
-	int i;
 
 	lock_super (sb);
 	es = EXT3_SB(sb)->s_es;
@@ -706,7 +694,14 @@ unsigned long ext3_count_free_inodes (st
 	unlock_super(sb);
 	return desc_count;
 #else
-	return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count);
+	desc_count = 0;
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+		gdp = ext3_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+	}
+	return desc_count;
 #endif
 }
diff -puN fs/ext3/super.c~ext3-concurrent-block-inode-allocation fs/ext3/super.c
--- 25/fs/ext3/super.c~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/fs/ext3/super.c  Thu Jun 5 15:14:11 2003
@@ -460,7 +460,7 @@ void ext3_put_super (struct super_block 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_debts);
+	kfree(sbi->s_bgi);
 	brelse(sbi->s_sbh);
 
 	/* Debugging code just in case the in-memory inode orphan list
@@ -901,6 +901,8 @@ static int ext3_check_descriptors (struc
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	struct ext3_group_desc * gdp = NULL;
+	unsigned long total_free;
+	unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	int desc_block = 0;
 	int i;
 
@@ -947,6 +949,43 @@ static int ext3_check_descriptors (struc
 		block += EXT3_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
+
+	total_free = ext3_count_free_blocks(sb);
+	if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count)) {
+		printk("EXT3-fs: invalid s_free_blocks_count %u (real %lu)\n",
+			le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count),
+			total_free);
+		EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free);
+	}
+
+	/* distribute reserved blocks over groups -bzzz */
+	for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) {
+		int free;
+
+		gdp = ext3_get_group_desc (sb, i, NULL);
+		if (!gdp) {
+			ext3_error (sb, "ext3_check_descriptors",
+					"cant get descriptor for group %d", i);
+			return 0;
+		}
+
+		free = le16_to_cpu(gdp->bg_free_blocks_count);
+		if (free > reserved)
+			free = reserved;
+		sbi->s_bgi[i].bg_reserved = free;
+		reserved -= free;
+		total_free -= free;
+	}
+
+	total_free = ext3_count_free_inodes(sb);
+	if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) {
+		printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n",
+			le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count),
+			total_free);
+		EXT3_SB(sb)->s_es->s_free_inodes_count = cpu_to_le32(total_free);
+	}
+
 	return 1;
 }
 
@@ -1307,13 +1346,17 @@ static int ext3_fill_super (struct super
 		printk (KERN_ERR "EXT3-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+	sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info),
 			GFP_KERNEL);
-	if (!sbi->s_debts) {
-		printk ("EXT3-fs: not enough memory\n");
+	if (!sbi->s_bgi) {
+		printk("EXT3-fs: not enough memory to allocate s_bgi\n");
 		goto failed_mount2;
 	}
-	memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+	memset(sbi->s_bgi, 0, sbi->s_groups_count * sizeof(struct ext3_bg_info));
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock);
+		spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock);
+	}
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -1329,7 +1372,6 @@ static int ext3_fill_super (struct super
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
-	sbi->s_dir_count = ext3_count_dirs(sb);
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -1432,8 +1474,7 @@ static int ext3_fill_super (struct super
 failed_mount3:
 	journal_destroy(sbi->s_journal);
 failed_mount2:
-	if (sbi->s_debts)
-		kfree(sbi->s_debts);
+	kfree(sbi->s_bgi);
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
@@ -1702,6 +1743,8 @@ static void ext3_commit_super (struct su
 	if (!sbh)
 		return;
 	es->s_wtime = cpu_to_le32(get_seconds());
+	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
+	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync)
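The mount-time distribution loop in ext3_check_descriptors() above
("distribute reserved blocks over groups") is easiest to check with concrete
numbers.  A toy stand-alone run, ignoring the total_free guard for brevity
(the counts are invented; this is not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned long free_blocks[4] = { 80, 10, 50, 40 };
        unsigned long bg_reserved[4] = { 0, 0, 0, 0 };
        unsigned long reserved = 100;   /* models s_r_blocks_count */
        int i;

        /* Walk from the last group down, as the mount-time loop does. */
        for (i = 3; reserved && i >= 0; i--) {
                unsigned long take = free_blocks[i];

                if (take > reserved)
                        take = reserved;
                bg_reserved[i] = take;
                reserved -= take;
        }

        /* Prints 0, 10, 50, 40: groups 3 and 2 absorb 90 of the 100
         * reserved blocks, group 1 the remaining 10, group 0 none. */
        for (i = 0; i < 4; i++)
                printf("group %d: bg_reserved = %lu\n", i, bg_reserved[i]);
        return 0;
}

Concentrating bg_reserved in the tail groups is what lets ext3_new_block()
subtract a per-group reservation and steer ordinary allocations away from
those groups until the caller passes the reservation test.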
diff -puN include/linux/ext3_fs.h~ext3-concurrent-block-inode-allocation include/linux/ext3_fs.h
--- 25/include/linux/ext3_fs.h~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/include/linux/ext3_fs.h  Thu Jun 5 15:14:11 2003
@@ -344,7 +344,9 @@ struct ext3_inode {
 #endif
 
 #define ext3_set_bit			ext2_set_bit
+#define ext3_set_bit_atomic		ext2_set_bit_atomic
 #define ext3_clear_bit			ext2_clear_bit
+#define ext3_clear_bit_atomic		ext2_clear_bit_atomic
 #define ext3_test_bit			ext2_test_bit
 #define ext3_find_first_zero_bit	ext2_find_first_zero_bit
 #define ext3_find_next_zero_bit		ext2_find_next_zero_bit
diff -puN include/linux/ext3_fs_sb.h~ext3-concurrent-block-inode-allocation include/linux/ext3_fs_sb.h
--- 25/include/linux/ext3_fs_sb.h~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/include/linux/ext3_fs_sb.h  Thu Jun 5 15:14:11 2003
@@ -21,6 +21,13 @@
 #include
 #endif
 
+struct ext3_bg_info {
+	u8 bg_debts;
+	spinlock_t bg_balloc_lock;
+	spinlock_t bg_ialloc_lock;
+	unsigned long bg_reserved;
+} ____cacheline_aligned_in_smp;
+
 /*
  * third extended-fs super-block data in memory
  */
@@ -50,8 +57,7 @@ struct ext3_sb_info {
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
-	unsigned long s_dir_count;
-	u8 *s_debts;
+	struct ext3_bg_info *s_bgi;
 
 	/* Journaling */
 	struct inode * s_journal_inode;
"ext3_new_inode", - "bit already set for inode %lu", ino); + if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) goto repeat; - } BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bitmap_bh); if (err) goto fail; @@ -504,26 +500,19 @@ repeat: BUFFER_TRACE(bh2, "get_write_access"); err = ext3_journal_get_write_access(handle, bh2); if (err) goto fail; + spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock); gdp->bg_free_inodes_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); if (S_ISDIR(mode)) { gdp->bg_used_dirs_count = cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); - EXT3_SB(sb)->s_dir_count++; } + spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock); BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); if (err) goto fail; - BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access"); - err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); - if (err) goto fail; - es->s_free_inodes_count = - cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); - BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); sb->s_dirt = 1; - if (err) goto fail; inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) @@ -576,7 +565,6 @@ repeat: ei->i_state = EXT3_STATE_NEW; - unlock_super(sb); ret = inode; if(DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); @@ -600,7 +588,6 @@ repeat: fail: ext3_std_error(sb, err); out: - unlock_super(sb); iput(inode); ret = ERR_PTR(err); really_out: @@ -673,12 +660,13 @@ out: unsigned long ext3_count_free_inodes (struct super_block * sb) { + unsigned long desc_count; + struct ext3_group_desc *gdp; + int i; #ifdef EXT3FS_DEBUG struct ext3_super_block *es; - unsigned long desc_count, bitmap_count, x; - struct ext3_group_desc *gdp; + unsigned long bitmap_count, x; struct buffer_head *bitmap_bh = NULL; - int i; lock_super (sb); es = EXT3_SB(sb)->s_es; @@ -706,7 +694,14 @@ unsigned long ext3_count_free_inodes (st unlock_super(sb); return desc_count; #else - return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count); + desc_count = 0; + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + } + return desc_count; #endif } diff -puN fs/ext3/super.c~ext3-concurrent-block-inode-allocation fs/ext3/super.c --- 25/fs/ext3/super.c~ext3-concurrent-block-inode-allocation Thu Jun 5 15:14:11 2003 +++ 25-akpm/fs/ext3/super.c Thu Jun 5 15:14:11 2003 @@ -460,7 +460,7 @@ void ext3_put_super (struct super_block for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_debts); + kfree(sbi->s_bgi); brelse(sbi->s_sbh); /* Debugging code just in case the in-memory inode orphan list @@ -901,6 +901,8 @@ static int ext3_check_descriptors (struc struct ext3_sb_info *sbi = EXT3_SB(sb); unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); struct ext3_group_desc * gdp = NULL; + unsigned long total_free; + unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count); int desc_block = 0; int i; @@ -947,6 +949,43 @@ static int ext3_check_descriptors (struc block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } + + total_free = ext3_count_free_blocks(sb); + if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count)) { + printk("EXT3-fs: invalid s_free_blocks_count %u (real %lu)\n", + 
diff -puN fs/jbd/transaction.c~ext3-concurrent-block-inode-allocation fs/jbd/transaction.c
--- 25/fs/jbd/transaction.c~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/fs/jbd/transaction.c  Thu Jun 5 15:14:11 2003
@@ -1106,7 +1106,6 @@ out_unlock:
 	return 0;
 }
 
-#if 0
 /*
  * journal_release_buffer: undo a get_write_access without any buffer
  * updates, if the update decided in the end that it didn't need access.
@@ -1140,7 +1139,6 @@ void journal_release_buffer (handle_t *h
 	JBUFFER_TRACE(jh, "exit");
 	unlock_journal(journal);
 }
-#endif
 
 /**
  * void journal_forget() - bforget() for potentially-journaled buffers.
diff -puN include/linux/ext3_jbd.h~ext3-concurrent-block-inode-allocation include/linux/ext3_jbd.h
--- 25/include/linux/ext3_jbd.h~ext3-concurrent-block-inode-allocation  Thu Jun 5 15:14:11 2003
+++ 25-akpm/include/linux/ext3_jbd.h  Thu Jun 5 15:14:11 2003
@@ -117,6 +117,12 @@ __ext3_journal_get_write_access(const ch
 }
 
 static inline void
+ext3_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+{
+	journal_release_buffer(handle, bh);
+}
+
+static inline void
 ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
 {
 	journal_forget(handle, bh);
_