From: "Theodore Ts'o" Here is a reworked version of my patch to ext3 to retry certain filesystem operations after an ENOSPC error. The ext3_should_retry_alloc() function will not wait on the currently running transaction if there is a currently active handle; hence this should avoid deadlocks in the Lustre use case. The patch is versus BK-recent. I've also included a simple, reliable test case which demonstrates the problem this patch is intended to fix. (Note that BK-recent is not sufficient to address this test case, and waiting on the committing transaction in ext3_new_block is also not sufficient. Been there, tried that, didn't work. We need to do the full-bore retry from the top level. The ext3_should_retry_alloc() will only wait on the committing transaction if there is an active handle; hence Lustre will probably also need to use ext3_should_retry_alloc() if it wants to reliably avoid this particular problem.) #!/bin/sh # # TEST_DIR=/tmp IMAGE=$TEST_DIR/retry.img MNTPT=$TEST_DIR/retry.mnt TEST_SRC=/usr/projects/e2fsprogs/e2fsprogs/build MKE2FS_OPTS="" IMAGE_SIZE=8192 umount $MNTPT dd if=/dev/zero of=$IMAGE bs=4k count=$IMAGE_SIZE mke2fs -j -F $MKE2FS_OPTS $IMAGE function test_log () { echo $* logger -p local4.notice $* } mkdir -p $MNTPT mount -o loop -t ext3 $IMAGE $MNTPT test_log Retry test: BEGIN for i in `seq 1 3` do test_log "Retry test: Loop $i" echo 2 > /proc/sys/fs/jbd-debug while ! 
mkdir -p $MNTPT/foo/bar do test_log "Retry test: mkdir failed" sleep 1 done echo 0 > /proc/sys/fs/jbd-debug cp -r $TEST_SRC $MNTPT/foo/bar 2> /dev/null rm -rf $MNTPT/* done umount $MNTPT test_log "Retry test: END" --- 25-akpm/fs/ext3/acl.c | 10 +++++- 25-akpm/fs/ext3/balloc.c | 62 ++++++++++++++++++++++++++++++++++++---- 25-akpm/fs/ext3/inode.c | 19 +++--------- 25-akpm/fs/ext3/namei.c | 25 ++++++++++++---- 25-akpm/fs/ext3/xattr.c | 6 +++ 25-akpm/include/linux/ext3_fs.h | 1 6 files changed, 96 insertions(+), 27 deletions(-) diff -puN fs/ext3/acl.c~ext3-retry-allocation-after-transaction-commit-v2 fs/ext3/acl.c --- 25/fs/ext3/acl.c~ext3-retry-allocation-after-transaction-commit-v2 Fri May 21 17:02:11 2004 +++ 25-akpm/fs/ext3/acl.c Fri May 21 17:02:11 2004 @@ -428,7 +428,9 @@ ext3_acl_chmod(struct inode *inode) error = posix_acl_chmod_masq(clone, inode->i_mode); if (!error) { handle_t *handle; + int retries = 0; + retry: handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -437,6 +439,9 @@ ext3_acl_chmod(struct inode *inode) } error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; } out: posix_acl_release(clone); @@ -516,7 +521,7 @@ ext3_xattr_set_acl(struct inode *inode, { handle_t *handle; struct posix_acl *acl; - int error; + int error, retries = 0; if (!test_opt(inode->i_sb, POSIX_ACL)) return -EOPNOTSUPP; @@ -535,11 +540,14 @@ ext3_xattr_set_acl(struct inode *inode, } else acl = NULL; +retry: handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); if (IS_ERR(handle)) return PTR_ERR(handle); error = ext3_set_acl(handle, inode, type, acl); ext3_journal_stop(handle); + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; release_and_out: posix_acl_release(acl); diff -puN fs/ext3/balloc.c~ext3-retry-allocation-after-transaction-commit-v2 
fs/ext3/balloc.c --- 25/fs/ext3/balloc.c~ext3-retry-allocation-after-transaction-commit-v2 Fri May 21 17:02:11 2004 +++ 25-akpm/fs/ext3/balloc.c Fri May 21 17:02:11 2004 @@ -962,6 +962,60 @@ out: return ret; } +static int ext3_has_free_blocks(struct ext3_sb_info *sbi) +{ + int free_blocks, root_blocks; + + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); + if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + return 0; + } + return 1; +} + +/* + * Ext3_should_retry_alloc is called when ENOSPC is returned, and if + * it is profitable to retry the operation, this function will wait + * for the current or commiting transaction to complete, and then + * return TRUE. + */ +int ext3_should_retry_alloc(struct super_block *sb, int *retries) +{ + transaction_t *transaction = NULL; + journal_t *journal = EXT3_SB(sb)->s_journal; + tid_t tid; + + if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3) + return 0; + + jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + + /* + * We can only force the running transaction if we don't have + * an active handle; otherwise, we will deadlock. + */ + spin_lock(&journal->j_state_lock); + if (journal->j_running_transaction && !current->journal_info) { + transaction = journal->j_running_transaction; + __log_start_commit(journal, transaction->t_tid); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + if (!transaction) { + spin_unlock(&journal->j_state_lock); + return 0; /* Nothing to retry */ + } + + tid = transaction->t_tid; + spin_unlock(&journal->j_state_lock); + log_wait_commit(journal, tid); + + return 1; +} + /* * ext3_new_block uses a goal block to assist allocation. 
If the goal is * free, or there is a free block within 32 blocks of the goal, that block @@ -982,7 +1036,7 @@ int ext3_new_block(handle_t *handle, str int target_block; int fatal = 0, err; int performed_allocation = 0; - int free_blocks, root_blocks; + int free_blocks; struct super_block *sb; struct ext3_group_desc *gdp; struct ext3_super_block *es; @@ -1011,11 +1065,7 @@ int ext3_new_block(handle_t *handle, str ext3_debug("goal=%lu.\n", goal); if (test_opt(sb, RESERVATION) && S_ISREG(inode->i_mode)) my_rsv = &EXT3_I(inode)->i_rsv_window; - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = le32_to_cpu(es->s_r_blocks_count); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + if (!ext3_has_free_blocks(sbi)) { *errp = -ENOSPC; goto out; } diff -puN fs/ext3/inode.c~ext3-retry-allocation-after-transaction-commit-v2 fs/ext3/inode.c --- 25/fs/ext3/inode.c~ext3-retry-allocation-after-transaction-commit-v2 Fri May 21 17:02:11 2004 +++ 25-akpm/fs/ext3/inode.c Fri May 21 17:02:11 2004 @@ -987,7 +987,7 @@ static int ext3_prepare_write(struct fil struct inode *inode = page->mapping->host; int ret, needed_blocks = ext3_writepage_trans_blocks(inode); handle_t *handle; - int tried_commit = 0; + int retries = 0; retry: handle = ext3_journal_start(inode, needed_blocks); @@ -996,19 +996,8 @@ retry: goto out; } ret = block_prepare_write(page, from, to, ext3_get_block); - if (ret) { - if (ret != -ENOSPC || tried_commit) - goto prepare_write_failed; - /* - * It could be that there _is_ free space, but it's all tied up - * in uncommitted bitmaps. So force a commit here, which makes - * those blocks allocatable and try again. 
- */ - tried_commit = 1; - handle->h_sync = 1; - ext3_journal_stop(handle); - goto retry; - } + if (ret) + goto prepare_write_failed; if (ext3_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -1017,6 +1006,8 @@ retry: prepare_write_failed: if (ret) ext3_journal_stop(handle); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: return ret; } diff -puN fs/ext3/namei.c~ext3-retry-allocation-after-transaction-commit-v2 fs/ext3/namei.c --- 25/fs/ext3/namei.c~ext3-retry-allocation-after-transaction-commit-v2 Fri May 21 17:02:11 2004 +++ 25-akpm/fs/ext3/namei.c Fri May 21 17:02:11 2004 @@ -1630,8 +1630,9 @@ static int ext3_create (struct inode * d { handle_t *handle; struct inode * inode; - int err; + int err, retries = 0; +retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2*EXT3_QUOTA_INIT_BLOCKS); @@ -1650,6 +1651,8 @@ static int ext3_create (struct inode * d err = ext3_add_nondir(handle, dentry, inode); } ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; return err; } @@ -1658,11 +1661,12 @@ static int ext3_mknod (struct inode * di { handle_t *handle; struct inode *inode; - int err; + int err, retries = 0; if (!new_valid_dev(rdev)) return -EINVAL; +retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2*EXT3_QUOTA_INIT_BLOCKS); @@ -1682,6 +1686,8 @@ static int ext3_mknod (struct inode * di err = ext3_add_nondir(handle, dentry, inode); } ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; return err; } @@ -1691,11 +1697,12 @@ static int ext3_mkdir(struct inode * dir struct inode * inode; struct buffer_head * dir_block; struct ext3_dir_entry_2 * de; - int err; + int err, retries = 0; if (dir->i_nlink >= EXT3_LINK_MAX) return -EMLINK; +retry: handle = ext3_journal_start(dir, 
EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2*EXT3_QUOTA_INIT_BLOCKS); @@ -1753,6 +1760,8 @@ static int ext3_mkdir(struct inode * dir d_instantiate(dentry, inode); out_stop: ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; return err; } @@ -2094,12 +2103,13 @@ static int ext3_symlink (struct inode * { handle_t *handle; struct inode * inode; - int l, err; + int l, err, retries = 0; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; +retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2*EXT3_QUOTA_INIT_BLOCKS); @@ -2138,6 +2148,8 @@ static int ext3_symlink (struct inode * err = ext3_add_nondir(handle, dentry, inode); out_stop: ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; return err; } @@ -2146,11 +2158,12 @@ static int ext3_link (struct dentry * ol { handle_t *handle; struct inode *inode = old_dentry->d_inode; - int err; + int err, retries = 0; if (inode->i_nlink >= EXT3_LINK_MAX) return -EMLINK; +retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS); if (IS_ERR(handle)) @@ -2165,6 +2178,8 @@ static int ext3_link (struct dentry * ol err = ext3_add_nondir(handle, dentry, inode); ext3_journal_stop(handle); + if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) + goto retry; return err; } diff -puN fs/ext3/xattr.c~ext3-retry-allocation-after-transaction-commit-v2 fs/ext3/xattr.c --- 25/fs/ext3/xattr.c~ext3-retry-allocation-after-transaction-commit-v2 Fri May 21 17:02:11 2004 +++ 25-akpm/fs/ext3/xattr.c Fri May 21 17:02:11 2004 @@ -875,8 +875,9 @@ ext3_xattr_set(struct inode *inode, int const void *value, size_t value_len, int flags) { handle_t *handle; - int error; + int error, retries = 0; +retry: handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); if (IS_ERR(handle)) { error = 
PTR_ERR(handle); @@ -886,6 +887,9 @@ ext3_xattr_set(struct inode *inode, int error = ext3_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); error2 = ext3_journal_stop(handle); + if (error == -ENOSPC && + ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (error == 0) error = error2; } diff -puN include/linux/ext3_fs.h~ext3-retry-allocation-after-transaction-commit-v2 include/linux/ext3_fs.h --- 25/include/linux/ext3_fs.h~ext3-retry-allocation-after-transaction-commit-v2 Fri May 21 17:02:11 2004 +++ 25-akpm/include/linux/ext3_fs.h Fri May 21 17:02:11 2004 @@ -690,6 +690,7 @@ extern void ext3_check_blocks_bitmap (st extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh); +extern int ext3_should_retry_alloc(struct super_block *sb, int *retries); /* dir.c */ extern int ext3_check_dir_entry(const char *, struct inode *, _