From: Alex Tomas Here is a port of your percpu_counters + group locks onto ext3. Plus, I think percpu_counter_read_positive() should not return 0, because ext2/ext3 have no counters that may be 0. Moreover, if percpu_counter_read_positive() returns 0 for dirs, then we'll get a 'divide by zero' oops. This patch fixes it up. fs/ext3/balloc.c | 53 ++++++++++++++++++++++----------------------- fs/ext3/ialloc.c | 43 ++++++++++++++++++++++++------------ fs/ext3/super.c | 47 +++++++++++++++------------------------ include/linux/ext3_fs_sb.h | 15 +++++------- 4 files changed, 81 insertions(+), 77 deletions(-) diff -puN fs/ext3/balloc.c~ext3-concurrent-block-allocation-hashed fs/ext3/balloc.c --- 25/fs/ext3/balloc.c~ext3-concurrent-block-allocation-hashed 2003-03-27 21:57:25.000000000 -0800 +++ 25-akpm/fs/ext3/balloc.c 2003-03-27 21:57:25.000000000 -0800 @@ -115,6 +115,7 @@ void ext3_free_blocks (handle_t *handle, struct super_block * sb; struct ext3_group_desc * gdp; struct ext3_super_block * es; + struct ext3_sb_info *sbi; int err = 0, ret; int dquot_freed_blocks = 0; @@ -123,6 +124,7 @@ void ext3_free_blocks (handle_t *handle, printk ("ext3_free_blocks: nonexistent device"); return; } + sbi = EXT3_SB(sb); es = EXT3_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || @@ -247,11 +249,12 @@ do_more: } } - spin_lock(bg_lock(sb, block_group)); + spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + dquot_freed_blocks); - spin_unlock(bg_lock(sb, block_group)); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -434,7 +437,7 @@ got: have_access = 1; } - if (!claim_block(bg_lock(sb, group), goal, bitmap_bh)) { + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) { /* * The block was allocated by another thread, or it was * 
allocated and then freed by another thread @@ -482,11 +485,11 @@ ext3_new_block(handle_t *handle, struct int target_block; /* tmp */ int fatal = 0, err; int performed_allocation = 0; - int free; - int use_reserve = 0; + int free_blocks, root_blocks; struct super_block *sb; struct ext3_group_desc *gdp; struct ext3_super_block *es; + struct ext3_sb_info *sbi; #ifdef EXT3FS_DEBUG static int goal_hits = 0, goal_attempts = 0; #endif @@ -505,9 +508,19 @@ ext3_new_block(handle_t *handle, struct return 0; } + sbi = EXT3_SB(sb); es = EXT3_SB(sb)->s_es; ext3_debug("goal=%lu.\n", goal); + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + *errp = -ENOSPC; + return 0; + } + /* * First, test whether the goal block is free. */ @@ -520,9 +533,8 @@ ext3_new_block(handle_t *handle, struct if (!gdp) goto io_error; - free = le16_to_cpu(gdp->bg_free_blocks_count); - free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved; - if (free > 0) { + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks > 0) { ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT3_BLOCKS_PER_GROUP(sb)); bitmap_bh = read_block_bitmap(sb, group_no); @@ -540,7 +552,6 @@ ext3_new_block(handle_t *handle, struct * Now search the rest of the groups. We assume that * i and gdp correctly point to the last group visited. 
*/ -repeat: for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) { group_no++; if (group_no >= EXT3_SB(sb)->s_groups_count) @@ -550,10 +561,8 @@ repeat: *errp = -EIO; goto out; } - free = le16_to_cpu(gdp->bg_free_blocks_count); - if (!use_reserve) - free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved; - if (free <= 0) + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks <= 0) continue; brelse(bitmap_bh); @@ -568,15 +577,6 @@ repeat: goto allocated; } - if (!use_reserve && - (EXT3_SB(sb)->s_resuid == current->fsuid || - (EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) || - capable(CAP_SYS_RESOURCE))) { - use_reserve = 1; - group_no = 0; - goto repeat; - } - /* No space left on the device */ *errp = -ENOSPC; goto out; @@ -617,13 +617,13 @@ allocated: } } #endif - spin_lock(bg_lock(sb, group_no)); + spin_lock(sb_bgl_lock(sbi, group_no)); if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) J_ASSERT_BH(bitmap_bh, !ext3_test_bit(ret_block, bh2jh(bitmap_bh)->b_committed_data)); ext3_debug("found bit %d\n", ret_block); - spin_unlock(bg_lock(sb, group_no)); + spin_unlock(sb_bgl_lock(sbi, group_no)); /* ret_block was blockgroup-relative. Now it becomes fs-relative */ ret_block = target_block; @@ -644,10 +644,11 @@ allocated: ext3_debug("allocating block %d. 
Goal hits %d of %d.\n", ret_block, goal_hits, goal_attempts); - spin_lock(bg_lock(sb, group_no)); + spin_lock(sb_bgl_lock(sbi, group_no)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); - spin_unlock(bg_lock(sb, group_no)); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_mod(&sbi->s_freeblocks_counter, -1); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); err = ext3_journal_dirty_metadata(handle, gdp_bh); diff -puN fs/ext3/ialloc.c~ext3-concurrent-block-allocation-hashed fs/ext3/ialloc.c --- 25/fs/ext3/ialloc.c~ext3-concurrent-block-allocation-hashed 2003-03-27 21:57:25.000000000 -0800 +++ 25-akpm/fs/ext3/ialloc.c 2003-03-27 23:15:17.000000000 -0800 @@ -97,6 +97,7 @@ void ext3_free_inode (handle_t *handle, unsigned long bit; struct ext3_group_desc * gdp; struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); int fatal = 0, err; if (atomic_read(&inode->i_count) > 1) { @@ -161,13 +162,17 @@ void ext3_free_inode (handle_t *handle, if (fatal) goto error_return; if (gdp) { - spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock); + spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_inodes_count = cpu_to_le16( le16_to_cpu(gdp->bg_free_inodes_count) + 1); if (is_directory) gdp->bg_used_dirs_count = cpu_to_le16( le16_to_cpu(gdp->bg_used_dirs_count) - 1); - spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + } BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); @@ -196,11 +201,14 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent) { int ngroups = EXT3_SB(sb)->s_groups_count; - int avefreei = ext3_count_free_inodes(sb) / ngroups; + int freei, avefreei; struct ext3_group_desc *desc, *best_desc = NULL; struct buffer_head *bh; 
int group, best_group = -1; + freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + for (group = 0; group < ngroups; group++) { desc = ext3_get_group_desc (sb, group, &bh); if (!desc || !desc->bg_free_inodes_count) @@ -252,17 +260,20 @@ static int find_group_orlov(struct super struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); - int freei = ext3_count_free_inodes(sb); - int avefreei = freei / ngroups; - int freeb = ext3_count_free_blocks(sb); - int avefreeb = freeb / ngroups; - int blocks_per_dir; - int ndirs = ext3_count_dirs(sb); + int freei, avefreei; + int freeb, avefreeb; + int blocks_per_dir, ndirs; int max_debt, max_dirs, min_blocks, min_inodes; int group = -1, i; struct ext3_group_desc *desc; struct buffer_head *bh; + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + if ((parent == sb->s_root->d_inode) || (parent->i_flags & EXT3_TOPDIR_FL)) { int best_ndir = inodes_per_group; @@ -289,8 +300,7 @@ static int find_group_orlov(struct super goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_free_blocks_count)) / ndirs; + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; @@ -309,7 +319,7 @@ static int find_group_orlov(struct super desc = ext3_get_group_desc (sb, group, &bh); if (!desc || !desc->bg_free_inodes_count) continue; - if (sbi->s_bgi[group].bg_debts >= max_debt) + if (sbi->s_debts[group] >= max_debt) continue; if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) continue; @@ -412,6 +422,7 @@ struct inode *ext3_new_inode(handle_t *h struct ext3_group_desc * gdp; struct 
ext3_super_block * es; struct ext3_inode_info *ei; + struct ext3_sb_info *sbi; int err = 0; struct inode *ret; @@ -426,6 +437,7 @@ struct inode *ext3_new_inode(handle_t *h ei = EXT3_I(inode); es = EXT3_SB(sb)->s_es; + sbi = EXT3_SB(sb); repeat: if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) @@ -491,18 +503,21 @@ repeat: BUFFER_TRACE(bh2, "get_write_access"); err = ext3_journal_get_write_access(handle, bh2); if (err) goto fail; - spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock); + spin_lock(sb_bgl_lock(sbi, group)); gdp->bg_free_inodes_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); if (S_ISDIR(mode)) { gdp->bg_used_dirs_count = cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); } - spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock); + spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); if (err) goto fail; + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; inode->i_uid = current->fsuid; diff -puN fs/ext3/super.c~ext3-concurrent-block-allocation-hashed fs/ext3/super.c --- 25/fs/ext3/super.c~ext3-concurrent-block-allocation-hashed 2003-03-27 21:57:25.000000000 -0800 +++ 25-akpm/fs/ext3/super.c 2003-03-27 21:57:25.000000000 -0800 @@ -464,7 +464,7 @@ void ext3_put_super (struct super_block for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_bgi); + kfree(sbi->s_debts); brelse(sbi->s_sbh); /* Debugging code just in case the in-memory inode orphan list @@ -904,7 +904,6 @@ static int ext3_check_descriptors (struc unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); struct ext3_group_desc * gdp = NULL; unsigned long total_free; - unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count); int desc_block = 0; int i; @@ -960,25 +959,6 @@ static int ext3_check_descriptors (struc 
EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free); } - /* distribute reserved blocks over groups -bzzz */ - for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) { - int free; - - gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) { - ext3_error (sb, "ext3_check_descriptors", - "cant get descriptor for group %d", i); - return 0; - } - - free = le16_to_cpu(gdp->bg_free_blocks_count); - if (free > reserved) - free = reserved; - sbi->s_bgi[i].bg_reserved = free; - reserved -= free; - total_free -= free; - } - total_free = ext3_count_free_inodes(sb); if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) { printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n", @@ -1348,17 +1328,19 @@ static int ext3_fill_super (struct super printk (KERN_ERR "EXT3-fs: not enough memory\n"); goto failed_mount; } - sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info), + sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(u8), GFP_KERNEL); - if (!sbi->s_bgi) { + if (!sbi->s_debts) { printk("EXT3-fs: not enough memory to allocate s_bgi\n"); goto failed_mount2; } - memset(sbi->s_bgi, 0, sbi->s_groups_count * sizeof(struct ext3_bg_info)); - for (i = 0; i < sbi->s_groups_count; i++) { - spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock); - spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock); - } + memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(u8)); + + percpu_counter_init(&sbi->s_freeblocks_counter); + percpu_counter_init(&sbi->s_freeinodes_counter); + percpu_counter_init(&sbi->s_dirs_counter); + bgl_lock_init(&sbi->s_blockgroup_lock); + for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); @@ -1470,12 +1452,19 @@ static int ext3_fill_super (struct super test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": "writeback"); + percpu_counter_mod(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + percpu_counter_mod(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + return 0; failed_mount3: journal_destroy(sbi->s_journal); failed_mount2: - kfree(sbi->s_bgi); + kfree(sbi->s_debts); for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); diff -puN include/linux/ext3_fs_sb.h~ext3-concurrent-block-allocation-hashed include/linux/ext3_fs_sb.h --- 25/include/linux/ext3_fs_sb.h~ext3-concurrent-block-allocation-hashed 2003-03-27 21:57:25.000000000 -0800 +++ 25-akpm/include/linux/ext3_fs_sb.h 2003-03-27 21:57:25.000000000 -0800 @@ -19,15 +19,10 @@ #ifdef __KERNEL__ #include #include +#include +#include #endif -struct ext3_bg_info { - u8 bg_debts; - spinlock_t bg_balloc_lock; - spinlock_t bg_ialloc_lock; - unsigned long bg_reserved; -} ____cacheline_aligned_in_smp; - /* * third extended-fs super-block data in memory */ @@ -57,7 +52,11 @@ struct ext3_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; - struct ext3_bg_info *s_bgi; + u8 *s_debts; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; /* Journaling */ struct inode * s_journal_inode; _