From: Alex Tomas This is a port from ext2 of the fuzzy counters (for Orlov allocator heuristics) and the hashed spinlocking (for the inode and block allocators). DESC fix ext3 inode allocator race EDESC From Alex Tomas. Fix a lockup wherein the inode allocation code will loop around thinking that a blockgroup has a free block, then finding that it didn't, then reselecting the same blockgroup. 25-akpm/fs/ext3/balloc.c | 53 +++++++-------- 25-akpm/fs/ext3/ialloc.c | 127 +++++++++++++++++++++---------------- 25-akpm/fs/ext3/super.c | 47 +++++-------- 25-akpm/include/linux/ext3_fs_sb.h | 15 ++-- 4 files changed, 126 insertions(+), 116 deletions(-) diff -puN fs/ext3/balloc.c~ext3-concurrent-block-allocation-hashed fs/ext3/balloc.c --- 25/fs/ext3/balloc.c~ext3-concurrent-block-allocation-hashed Thu Jun 5 15:14:12 2003 +++ 25-akpm/fs/ext3/balloc.c Thu Jun 5 15:14:12 2003 @@ -110,6 +110,7 @@ void ext3_free_blocks (handle_t *handle, struct super_block * sb; struct ext3_group_desc * gdp; struct ext3_super_block * es; + struct ext3_sb_info *sbi; int err = 0, ret; int dquot_freed_blocks = 0; @@ -118,6 +119,7 @@ void ext3_free_blocks (handle_t *handle, printk ("ext3_free_blocks: nonexistent device"); return; } + sbi = EXT3_SB(sb); es = EXT3_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || @@ -242,11 +244,12 @@ do_more: } } - spin_lock(bg_lock(sb, block_group)); + spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + dquot_freed_blocks); - spin_unlock(bg_lock(sb, block_group)); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_mod(&sbi->s_freeblocks_counter, count); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -429,7 +432,7 @@ got: have_access = 1; } - if (!claim_block(bg_lock(sb, group), goal, bitmap_bh)) { + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) { /* * The block was allocated by 
another thread, or it was * allocated and then freed by another thread @@ -477,11 +480,11 @@ ext3_new_block(handle_t *handle, struct int target_block; /* tmp */ int fatal = 0, err; int performed_allocation = 0; - int free; - int use_reserve = 0; + int free_blocks, root_blocks; struct super_block *sb; struct ext3_group_desc *gdp; struct ext3_super_block *es; + struct ext3_sb_info *sbi; #ifdef EXT3FS_DEBUG static int goal_hits = 0, goal_attempts = 0; #endif @@ -500,9 +503,19 @@ ext3_new_block(handle_t *handle, struct return 0; } + sbi = EXT3_SB(sb); es = EXT3_SB(sb)->s_es; ext3_debug("goal=%lu.\n", goal); + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + *errp = -ENOSPC; + return 0; + } + /* * First, test whether the goal block is free. */ @@ -515,9 +528,8 @@ ext3_new_block(handle_t *handle, struct if (!gdp) goto io_error; - free = le16_to_cpu(gdp->bg_free_blocks_count); - free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved; - if (free > 0) { + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks > 0) { ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) % EXT3_BLOCKS_PER_GROUP(sb)); bitmap_bh = read_block_bitmap(sb, group_no); @@ -535,7 +547,6 @@ ext3_new_block(handle_t *handle, struct * Now search the rest of the groups. We assume that * i and gdp correctly point to the last group visited. 
*/ -repeat: for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) { group_no++; if (group_no >= EXT3_SB(sb)->s_groups_count) @@ -545,10 +556,8 @@ repeat: *errp = -EIO; goto out; } - free = le16_to_cpu(gdp->bg_free_blocks_count); - if (!use_reserve) - free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved; - if (free <= 0) + free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); + if (free_blocks <= 0) continue; brelse(bitmap_bh); @@ -563,15 +572,6 @@ repeat: goto allocated; } - if (!use_reserve && - (EXT3_SB(sb)->s_resuid == current->fsuid || - (EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) || - capable(CAP_SYS_RESOURCE))) { - use_reserve = 1; - group_no = 0; - goto repeat; - } - /* No space left on the device */ *errp = -ENOSPC; goto out; @@ -612,13 +612,13 @@ allocated: } } #endif - spin_lock(bg_lock(sb, group_no)); + spin_lock(sb_bgl_lock(sbi, group_no)); if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) J_ASSERT_BH(bitmap_bh, !ext3_test_bit(ret_block, bh2jh(bitmap_bh)->b_committed_data)); ext3_debug("found bit %d\n", ret_block); - spin_unlock(bg_lock(sb, group_no)); + spin_unlock(sb_bgl_lock(sbi, group_no)); /* ret_block was blockgroup-relative. Now it becomes fs-relative */ ret_block = target_block; @@ -639,10 +639,11 @@ allocated: ext3_debug("allocating block %d. 
Goal hits %d of %d.\n", ret_block, goal_hits, goal_attempts); - spin_lock(bg_lock(sb, group_no)); + spin_lock(sb_bgl_lock(sbi, group_no)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); - spin_unlock(bg_lock(sb, group_no)); + spin_unlock(sb_bgl_lock(sbi, group_no)); + percpu_counter_mod(&sbi->s_freeblocks_counter, -1); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); err = ext3_journal_dirty_metadata(handle, gdp_bh); diff -puN fs/ext3/ialloc.c~ext3-concurrent-block-allocation-hashed fs/ext3/ialloc.c --- 25/fs/ext3/ialloc.c~ext3-concurrent-block-allocation-hashed Thu Jun 5 15:14:12 2003 +++ 25-akpm/fs/ext3/ialloc.c Thu Jun 5 15:14:12 2003 @@ -97,6 +97,7 @@ void ext3_free_inode (handle_t *handle, unsigned long bit; struct ext3_group_desc * gdp; struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); int fatal = 0, err; if (atomic_read(&inode->i_count) > 1) { @@ -161,13 +162,17 @@ void ext3_free_inode (handle_t *handle, if (fatal) goto error_return; if (gdp) { - spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock); + spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_inodes_count = cpu_to_le16( le16_to_cpu(gdp->bg_free_inodes_count) + 1); if (is_directory) gdp->bg_used_dirs_count = cpu_to_le16( le16_to_cpu(gdp->bg_used_dirs_count) - 1); - spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock); + spin_unlock(sb_bgl_lock(sbi, block_group)); + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (is_directory) + percpu_counter_dec(&sbi->s_dirs_counter); + } BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); @@ -196,11 +201,14 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent) { int ngroups = EXT3_SB(sb)->s_groups_count; - int avefreei = ext3_count_free_inodes(sb) / ngroups; + int freei, avefreei; struct ext3_group_desc *desc, *best_desc = NULL; struct buffer_head *bh; int group, best_group = 
-1; + freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); + avefreei = freei / ngroups; + for (group = 0; group < ngroups; group++) { desc = ext3_get_group_desc (sb, group, &bh); if (!desc || !desc->bg_free_inodes_count) @@ -252,17 +260,20 @@ static int find_group_orlov(struct super struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); - int freei = ext3_count_free_inodes(sb); - int avefreei = freei / ngroups; - int freeb = ext3_count_free_blocks(sb); - int avefreeb = freeb / ngroups; - int blocks_per_dir; - int ndirs = ext3_count_dirs(sb); + int freei, avefreei; + int freeb, avefreeb; + int blocks_per_dir, ndirs; int max_debt, max_dirs, min_blocks, min_inodes; int group = -1, i; struct ext3_group_desc *desc; struct buffer_head *bh; + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + avefreeb = freeb / ngroups; + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + if ((parent == sb->s_root->d_inode) || (parent->i_flags & EXT3_TOPDIR_FL)) { int best_ndir = inodes_per_group; @@ -289,8 +300,7 @@ static int find_group_orlov(struct super goto fallback; } - blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_free_blocks_count)) / ndirs; + blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs; max_dirs = ndirs / ngroups + inodes_per_group / 16; min_inodes = avefreei - inodes_per_group / 4; @@ -309,7 +319,7 @@ static int find_group_orlov(struct super desc = ext3_get_group_desc (sb, group, &bh); if (!desc || !desc->bg_free_inodes_count) continue; - if (sbi->s_bgi[group].bg_debts >= max_debt) + if (sbi->s_debts[group] >= max_debt) continue; if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) continue; @@ -416,13 +426,15 @@ struct inode *ext3_new_inode(handle_t *h struct buffer_head *bitmap_bh = NULL; struct buffer_head 
*bh2; int group; - unsigned long ino; + unsigned long ino = 0; struct inode * inode; - struct ext3_group_desc * gdp; + struct ext3_group_desc * gdp = NULL; struct ext3_super_block * es; struct ext3_inode_info *ei; + struct ext3_sb_info *sbi; int err = 0; struct inode *ret; + int i; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -435,7 +447,7 @@ struct inode *ext3_new_inode(handle_t *h ei = EXT3_I(inode); es = EXT3_SB(sb)->s_es; -repeat: + sbi = EXT3_SB(sb); if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) group = find_group_dir(sb, dir); @@ -448,46 +460,52 @@ repeat: if (group == -1) goto out; - err = -EIO; - brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, group); - if (!bitmap_bh) - goto fail; - gdp = ext3_get_group_desc (sb, group, &bh2); - - if ((ino = ext3_find_first_zero_bit((unsigned long *)bitmap_bh->b_data, - EXT3_INODES_PER_GROUP(sb))) < - EXT3_INODES_PER_GROUP(sb)) { - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext3_journal_get_write_access(handle, bitmap_bh); - if (err) goto fail; - - if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) - goto repeat; - BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bitmap_bh); - if (err) goto fail; - } else { - if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) { - ext3_error (sb, "ext3_new_inode", - "Free inodes count corrupted in group %d", - group); - /* Is it really ENOSPC? 
*/ - err = -ENOSPC; - if (sb->s_flags & MS_RDONLY) + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext3_get_group_desc(sb, group, &bh2); + + err = -EIO; + brelse(bitmap_bh); + bitmap_bh = read_inode_bitmap(sb, group); + if (!bitmap_bh) + goto fail; + + ino = ext3_find_first_zero_bit((unsigned long *) + bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb)); + if (ino < EXT3_INODES_PER_GROUP(sb)) { + BUFFER_TRACE(bitmap_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bitmap_bh); + if (err) goto fail; - BUFFER_TRACE(bh2, "get_write_access"); - err = ext3_journal_get_write_access(handle, bh2); - if (err) goto fail; - gdp->bg_free_inodes_count = 0; - BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); - err = ext3_journal_dirty_metadata(handle, bh2); - if (err) goto fail; + if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group), + ino, bitmap_bh->b_data)) { + /* we won it */ + BUFFER_TRACE(bitmap_bh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, + bitmap_bh); + if (err) + goto fail; + goto got; + } + /* we lost it */ + journal_release_buffer(handle, bitmap_bh); } - goto repeat; + + /* + * This case is possible in concurrent environment. It is very + * rare. We cannot repeat the find_group_xxx() call because + * that will simply return the same blockgroup, because the + * group descriptor metadata has not yet been updated. + * So we just go onto the next blockgroup. 
+ */ + if (++group == sbi->s_groups_count) + group = 0; } + err = -ENOSPC; + goto out; + +got: ino += group * EXT3_INODES_PER_GROUP(sb) + 1; if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { ext3_error (sb, "ext3_new_inode", @@ -500,18 +518,21 @@ repeat: BUFFER_TRACE(bh2, "get_write_access"); err = ext3_journal_get_write_access(handle, bh2); if (err) goto fail; - spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock); + spin_lock(sb_bgl_lock(sbi, group)); gdp->bg_free_inodes_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); if (S_ISDIR(mode)) { gdp->bg_used_dirs_count = cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); } - spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock); + spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); if (err) goto fail; + percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; inode->i_uid = current->fsuid; diff -puN fs/ext3/super.c~ext3-concurrent-block-allocation-hashed fs/ext3/super.c --- 25/fs/ext3/super.c~ext3-concurrent-block-allocation-hashed Thu Jun 5 15:14:12 2003 +++ 25-akpm/fs/ext3/super.c Thu Jun 5 15:14:12 2003 @@ -460,7 +460,7 @@ void ext3_put_super (struct super_block for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_bgi); + kfree(sbi->s_debts); brelse(sbi->s_sbh); /* Debugging code just in case the in-memory inode orphan list @@ -902,7 +902,6 @@ static int ext3_check_descriptors (struc unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); struct ext3_group_desc * gdp = NULL; unsigned long total_free; - unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count); int desc_block = 0; int i; @@ -958,25 +957,6 @@ static int ext3_check_descriptors (struc EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free); } - /* distribute reserved 
blocks over groups -bzzz */ - for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) { - int free; - - gdp = ext3_get_group_desc (sb, i, NULL); - if (!gdp) { - ext3_error (sb, "ext3_check_descriptors", - "cant get descriptor for group %d", i); - return 0; - } - - free = le16_to_cpu(gdp->bg_free_blocks_count); - if (free > reserved) - free = reserved; - sbi->s_bgi[i].bg_reserved = free; - reserved -= free; - total_free -= free; - } - total_free = ext3_count_free_inodes(sb); if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) { printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n", @@ -1346,17 +1326,19 @@ static int ext3_fill_super (struct super printk (KERN_ERR "EXT3-fs: not enough memory\n"); goto failed_mount; } - sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info), + sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(u8), GFP_KERNEL); - if (!sbi->s_bgi) { + if (!sbi->s_debts) { printk("EXT3-fs: not enough memory to allocate s_bgi\n"); goto failed_mount2; } - memset(sbi->s_bgi, 0, sbi->s_groups_count * sizeof(struct ext3_bg_info)); - for (i = 0; i < sbi->s_groups_count; i++) { - spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock); - spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock); - } + memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(u8)); + + percpu_counter_init(&sbi->s_freeblocks_counter); + percpu_counter_init(&sbi->s_freeinodes_counter); + percpu_counter_init(&sbi->s_dirs_counter); + bgl_lock_init(&sbi->s_blockgroup_lock); + for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); @@ -1469,12 +1451,19 @@ static int ext3_fill_super (struct super test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": "writeback"); + percpu_counter_mod(&sbi->s_freeblocks_counter, + ext3_count_free_blocks(sb)); + percpu_counter_mod(&sbi->s_freeinodes_counter, + ext3_count_free_inodes(sb)); + percpu_counter_mod(&sbi->s_dirs_counter, + ext3_count_dirs(sb)); + return 0; failed_mount3: journal_destroy(sbi->s_journal); failed_mount2: - kfree(sbi->s_bgi); + kfree(sbi->s_debts); for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); diff -puN include/linux/ext3_fs_sb.h~ext3-concurrent-block-allocation-hashed include/linux/ext3_fs_sb.h --- 25/include/linux/ext3_fs_sb.h~ext3-concurrent-block-allocation-hashed Thu Jun 5 15:14:12 2003 +++ 25-akpm/include/linux/ext3_fs_sb.h Thu Jun 5 15:14:12 2003 @@ -19,15 +19,10 @@ #ifdef __KERNEL__ #include #include +#include +#include #endif -struct ext3_bg_info { - u8 bg_debts; - spinlock_t bg_balloc_lock; - spinlock_t bg_ialloc_lock; - unsigned long bg_reserved; -} ____cacheline_aligned_in_smp; - /* * third extended-fs super-block data in memory */ @@ -57,7 +52,11 @@ struct ext3_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; - struct ext3_bg_info *s_bgi; + u8 *s_debts; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; /* Journaling */ struct inode * s_journal_inode; _