diff options
Diffstat (limited to 'fs')
74 files changed, 991 insertions, 735 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 360a5304ec03c..b01b1bbf24937 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -344,17 +344,21 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); __le32 __maybe_unused version; - truncate_inode_pages_final(&inode->i_data); + if (!is_bad_inode(inode)) { + truncate_inode_pages_final(&inode->i_data); - version = cpu_to_le32(v9inode->qid.version); - netfs_clear_inode_writeback(inode, &version); + version = cpu_to_le32(v9inode->qid.version); + netfs_clear_inode_writeback(inode, &version); - clear_inode(inode); - filemap_fdatawrite(&inode->i_data); + clear_inode(inode); + filemap_fdatawrite(&inode->i_data); #ifdef CONFIG_9P_FSCACHE - fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); + if (v9fs_inode_cookie(v9inode)) + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); #endif + } else + clear_inode(inode); } struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index ef9db3e035062..55dde186041a3 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -78,11 +78,11 @@ struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid) retval = v9fs_init_inode(v9ses, inode, &fid->qid, st->st_mode, new_decode_dev(st->st_rdev)); + v9fs_stat2inode_dotl(st, inode, 0); kfree(st); if (retval) goto error; - v9fs_stat2inode_dotl(st, inode, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); @@ -297,7 +297,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, umode_t omode) { int err; - struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; @@ -307,7 +306,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); - v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) @@ -739,7 +737,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, kgid_t gid; const unsigned char *name; umode_t mode; - struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; @@ -749,7 +746,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); - v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index b02796c8a5953..f6d86eb4b9765 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -67,6 +67,7 @@ bcachefs-y := \ quota.o \ rebalance.o \ recovery.o \ + recovery_passes.o \ reflink.o \ replicas.o \ sb-clean.o \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 8cb35ea572cb9..762c8ddfc5e73 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -29,8 +29,7 @@ static bool extent_matches_bp(struct bch_fs *c, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree_id, level, k, p, - &bucket2, &bp2); + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bucket, bucket2) && !memcmp(&bp, &bp2, sizeof(bp))) return true; @@ -44,6 +43,11 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + /* these will be caught by fsck */ + if (!bch2_dev_exists2(c, bp.k->p.inode)) + return 0; + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); int ret = 0; @@ -378,7 +382,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, bp_iter, 0); goto out; } @@ -502,8 +506,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, btree, level, - k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp); ret = check_bp_exists(trans, s, bucket_pos, bp, k); if (ret) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 327365a9feac4..da012ca7daee5 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -90,20 +90,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } -static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p) +static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + struct extent_ptr_decoded p, + const union bch_extent_entry *entry) { - return level ? BCH_DATA_btree : - p.has_ec ? BCH_DATA_stripe : - BCH_DATA_user; + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + case KEY_TYPE_stripe: { + const struct bch_extent_ptr *ptr = &entry->ptr; + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } } static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); s64 sectors = level ? btree_sectors(c) : k.k->size; u32 bucket_offset; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 799aa32b6b4d9..963162a627cd6 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -209,7 +209,7 @@ #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_types.h" +#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" #include "time_stats.h" @@ -810,7 +810,6 @@ struct bch_fs { /* snapshot.c: */ struct snapshot_table __rcu *snapshots; - size_t snapshot_table_size; struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 3fd1085b6c61e..3bb477840eab6 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b, printbuf_exit(&buf); } -#ifdef CONFIG_BCACHEFS_DEBUG - -void __bch2_verify_btree_nr_keys(struct btree *b) +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) { struct bset_tree *t; struct bkey_packed *k; - struct btree_nr_keys nr = { 0 }; + struct btree_nr_keys nr = {}; for_each_bset(b, t) bset_tree_for_each_key(b, t, k) if (!bkey_deleted(k)) btree_keys_account_key_add(&nr, t - b->set, k); + return nr; +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct btree_nr_keys nr = bch2_btree_node_count_keys(b); BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 79c77baaa3838..120a79fd456bd 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, /* Accounting: */ +struct btree_nr_keys bch2_btree_node_count_keys(struct btree *); + static inline void btree_keys_account_key(struct btree_nr_keys *n, unsigned bset, struct bkey_packed *k, diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 562561a9a510e..84474324dba9b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -808,7 +808,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) prt_printf(&buf, "\nmax "); bch2_bpos_to_text(&buf, b->data->max_key); - bch2_fs_inconsistent(c, "%s", buf.buf); + bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); } @@ -1134,6 +1135,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) b = btree_cache_find(bc, k); if (!b) return; + + BUG_ON(b == btree_node_root(trans->c, b)); wait_on_io: /* not allowed to wait on io with btree locks held: */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index bdaed29f084a4..e5d2c6daa663d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -7,6 +7,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "bkey_methods.h" #include "bkey_buf.h" #include "btree_journal_iter.h" @@ -24,7 +25,7 @@ #include "journal.h" #include "keylist.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "super-io.h" @@ -70,90 +71,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* - * Missing: if an interior btree node is empty, we need to do something - - * perhaps just kill it - */ -static int bch2_gc_check_topology(struct bch_fs *c, - struct btree *b, - struct bkey_buf *prev, - struct bkey_buf cur, - bool is_last) -{ - struct bpos node_start = b->data->min_key; - struct bpos node_end = b->data->max_key; - struct bpos expected_start = bkey_deleted(&prev->k->k) - ? node_start - : bpos_successor(prev->k->k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - int ret = 0; - - if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { - struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - - if (!bpos_eq(expected_start, bp->v.min_key)) { - bch2_topology_error(c); - - if (bkey_deleted(&prev->k->k)) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, node_start); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); - } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); - - if (__fsck_err(c, - FSCK_CAN_FIX| - FSCK_CAN_IGNORE| - FSCK_NO_RATELIMIT, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " cur %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - } - - if (is_last && !bpos_eq(cur.k->k.p, node_end)) { - bch2_topology_error(c); - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); - bch2_bpos_to_text(&buf2, node_end); - - if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf) && - should_restart_for_topology_repair(c)) { - bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - goto err; - } else { - set_bit(BCH_FS_initial_gc_unfixed, &c->flags); - } - } - - bch2_bkey_buf_copy(prev, c, cur.k); -err: -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) { switch (b->key.k.type) { @@ -445,6 +362,7 @@ again: prev = NULL; if (ret == DROP_PREV_NODE) { + bch_info(c, "dropped prev node"); bch2_btree_node_evict(trans, prev_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, prev_k.k->k.p); @@ -591,7 +509,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c); if (fsck_err_on(!g->gen_valid, c, ptr_to_missing_alloc_key, @@ -657,7 +575,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id continue; if (fsck_err_on(bucket_data_type(g->data_type) && - bucket_data_type(g->data_type) != data_type, c, + bucket_data_type(g->data_type) != + bucket_data_type(data_type), c, ptr_bucket_data_type_mismatch, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", @@ -698,18 +617,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } if (do_update) { - struct bkey_ptrs ptrs; - union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct bkey_i *new; - if (is_root) { bch_err(c, "cannot update btree roots yet"); ret = -EINVAL; goto err; } - new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { ret = -BCH_ERR_ENOMEM_gc_repair_key; bch_err_msg(c, ret, "allocating new key"); @@ -724,7 +638,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id * btree node isn't there anymore, the read path will * sort it out: */ - ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); struct bucket *g = PTR_GC_BUCKET(ca, ptr); @@ -732,19 +646,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id ptr->gen = g->gen; } } else { - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); - - (ptr->cached && - (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || - (!ptr->cached && - gen_cmp(ptr->gen, g->gen) < 0) || - gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || - (g->data_type && - g->data_type != data_type); - })); + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry); + + if ((p.ptr.cached && + (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) || + (!p.ptr.cached && + gen_cmp(p.ptr.gen, g->gen) < 0) || + gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type)) { + bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr); + goto restart_drop_ptrs; + } + } again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { @@ -774,12 +695,6 @@ found: } } - ret = bch2_journal_key_insert_take(c, btree_id, level, new); - if (ret) { - kfree(new); - goto err; - } - if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); @@ -793,6 +708,12 @@ found: bch_info(c, "new key %s", buf.buf); } + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + *k = bkey_i_to_s_c(new); } err: @@ -819,10 +740,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, BUG_ON(bch2_journal_seq_verify && k->k->version.lo > atomic64_read(&c->journal.seq)); - ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); - if (ret) - goto err; - if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, "key version number higher than recorded: %llu > %llu", @@ -831,8 +748,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, atomic64_set(&c->key_version, k->k->version.lo); } + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); + if (ret) + goto err; + ret = commit_do(trans, NULL, NULL, 0, - bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); + bch2_key_trigger(trans, btree_id, level, old, + unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC)); fsck_err: err: bch_err_fn(c, ret); @@ -841,42 +763,30 @@ err: static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { - struct bch_fs *c = trans->c; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; - struct bkey_buf prev, cur; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; bch2_btree_node_iter_init_from_start(&iter, b); - bch2_bkey_buf_init(&prev); - bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, initial); if (ret) - break; + return ret; bch2_btree_node_iter_advance(&iter, b); - - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - - ret = bch2_gc_check_topology(c, b, &prev, cur, - bch2_btree_node_iter_end(&iter)); - if (ret) - break; - } } - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - return ret; + return 0; } static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, @@ -925,14 +835,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_buf cur, prev; + struct bkey_buf cur; struct printbuf buf = PRINTBUF; int ret = 0; + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); - bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -943,20 +855,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (ret) goto fsck_err; - if (b->c.level) { - bch2_bkey_buf_reassemble(&cur, c, k); - k = bkey_i_to_s_c(cur.k); - - bch2_btree_and_journal_iter_advance(&iter); - - ret = bch2_gc_check_topology(c, b, - &prev, cur, - !bch2_btree_and_journal_iter_peek(&iter).k); - if (ret) - goto fsck_err; - } else { - bch2_btree_and_journal_iter_advance(&iter); - } + bch2_btree_and_journal_iter_advance(&iter); } if (b->c.level > target_depth) { @@ -1015,7 +914,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b } fsck_err: bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); bch2_btree_and_journal_iter_exit(&iter); printbuf_exit(&buf); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 34df8ccc5fecc..9c71e6fb9c41b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -654,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) */ bch2_bset_set_no_aux_tree(b, b->set); bch2_btree_build_aux_trees(b); + b->nr = bch2_btree_node_count_keys(b); struct bkey_s_c k; struct bkey unpacked; @@ -1657,7 +1658,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err(c, "%s", buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51bcdc6c6d1cd..2a211a4bebd15 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (ret) goto err; } else { - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); + struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); + if (!k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_btree_need_topology_repair; + goto err; + } + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); if ((flags & BTREE_ITER_PREFETCH) && c->opts.btree_node_prefetch) { @@ -962,7 +976,6 @@ err: return ret; } - static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; @@ -2790,6 +2803,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); + if (trans->used_mempool) { + if (trans->mem_bytes >= new_bytes) + goto out_change_top; + + /* No more space from mempool item, need malloc new one */ + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = kmalloc(new_bytes, GFP_KERNEL); + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_mem); + return ERR_PTR(ret); + } + } + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = false; + mempool_free(trans->mem, &c->btree_trans_mem_pool); + goto out_new_mem; + } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); @@ -2798,6 +2836,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; + memcpy(new_mem, trans->mem, trans->mem_top); + trans->used_mempool = true; kfree(trans->mem); } @@ -2811,7 +2851,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (ret) return ERR_PTR(ret); } - +out_new_mem: trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -2819,7 +2859,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } - +out_change_top: p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3093,7 +3133,7 @@ void bch2_trans_put(struct btree_trans *trans) if (paths_allocated != trans->_paths_allocated) kvfree_rcu_mightsleep(paths_allocated); - if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + if (trans->used_mempool) mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 50e04356d72c8..1f588264575db 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -261,6 +261,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, return bch2_journal_key_insert(c, id, level, &whiteout); } +bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (!trans->journal_replay_not_finished) + return false; + + return (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + bkey_deleted(&keys->data[idx].k->k)); +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -363,7 +379,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { - struct bkey_s_c btree_k, journal_k, ret; + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -375,9 +391,10 @@ again: bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); - while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_lt(journal_k.k->p, iter->pos)) - bch2_journal_iter_advance(&iter->journal); + if (iter->trans->journal_replay_not_finished) + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) @@ -435,7 +452,9 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, bch2_btree_node_iter_init_from_start(&node_iter, b); __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &trans->c->journal_iters); + if (trans->journal_replay_not_finished && + !test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index c9d19da3ea048..09cd53091a05b 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -40,8 +40,8 @@ int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); +bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 30d69a6d133ee..96669fede7d34 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a4b40c1656a54..8e47e260eba59 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans, struct bkey_i *update; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + update = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(update); if (ret) @@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; + if (unlikely(trans->journal_replay_not_finished)) + return 0; + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b2f5f2e50f7e1..983bb27298cc2 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" @@ -18,6 +19,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -44,56 +46,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, return path_idx; } -/* Debug code: */ - /* * Verify that child nodes correctly span parent node's range: */ -static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos next_node = b->data->min_key; - struct btree_node_iter iter; + struct bch_fs *c = trans->c; + struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : b->data->min_key; + struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bkey_s_c_btree_ptr_v2 bp; - struct bkey unpacked; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; + struct bkey_buf prev; + int ret = 0; - BUG_ON(!b->c.level); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; + if (!b->c.level) + return 0; - bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); - while (1) { - k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) - break; - bp = bkey_s_c_to_btree_ptr_v2(k); + goto out; - if (!bpos_eq(next_node, bp.v->min_key)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, next_node); - bch2_bpos_to_text(&buf2, bp.v->min_key); - panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); - } + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_btree_node_iter_advance(&iter, b); + struct bpos expected_min = bkey_deleted(&prev.k->k) + ? node_min + : bpos_successor(prev.k->k.p); - if (bch2_btree_node_iter_end(&iter)) { - if (!bpos_eq(k.k->p, b->key.k.p)) { - bch2_dump_btree_node(c, b); - bch2_bpos_to_text(&buf1, b->key.k.p); - bch2_bpos_to_text(&buf2, k.k->p); - panic("expected end %s got %s\n", buf1.buf, buf2.buf); - } - break; + if (!bpos_eq(expected_min, bp.v->min_key)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "end of prev node doesn't match start of next node\n"), + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + + need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; } - next_node = bpos_successor(k.k->p); + bch2_bkey_buf_reassemble(&prev, c, k); + bch2_btree_and_journal_iter_advance(&iter); } -#endif + + if (bkey_deleted(&prev.k->k)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "empty interior node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); + prt_str(&buf, "last child node doesn't end at end of parent node\n"); + prt_printf(&buf, " in btree %s level %u node ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + + need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } +out: +fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); + printbuf_exit(&buf); + return ret; +topology_repair: + if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { + bch2_inconsistent_error(c); + ret = -BCH_ERR_btree_need_topology_repair; + } else { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -1380,9 +1429,16 @@ static void __btree_split_node(struct btree_update *as, if (bkey_deleted(k)) continue; + uk = bkey_unpack_key(b, k); + + if (b->c.level && + u64s < n1_u64s && + u64s + k->u64s >= n1_u64s && + bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + n1_u64s += k->u64s; + i = u64s >= n1_u64s; u64s += k->u64s; - uk = bkey_unpack_key(b, k); if (!i) n1_pos = uk.p; bch2_bkey_format_add_key(&format[i], &uk); @@ -1441,8 +1497,7 @@ static void __btree_split_node(struct btree_update *as, bch2_verify_btree_nr_keys(n[i]); - if (b->c.level) - btree_node_interior_verify(as->c, n[i]); + BUG_ON(bch2_btree_node_check_topology(trans, n[i])); } } @@ -1473,7 +1528,7 @@ static void btree_split_insert_keys(struct btree_update *as, __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - btree_node_interior_verify(as->c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1488,9 +1543,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; + bch2_verify_btree_nr_keys(b); BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) + return ret; + bch2_btree_interior_update_will_free_node(as, b); if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { @@ -1710,7 +1770,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - btree_node_interior_verify(c, b); + ret = bch2_btree_node_check_topology(trans, b); + if (ret) { + bch2_btree_node_unlock_write(trans, path, b); + return ret; + } bch2_btree_insert_keys_interior(as, trans, path, b, keys); @@ -1728,7 +1792,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t bch2_btree_node_unlock_write(trans, path, b); - btree_node_interior_verify(c, b); + BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* @@ -1818,9 +1882,12 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, { struct bch_fs *c = trans->c; struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + + if (btree_node_fake(b)) + return bch2_btree_split_leaf(trans, path, flags); + struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, - b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index f651dd48aaa04..7ac3e17b84fde 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -10,6 +10,8 @@ #define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) +int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 5cbad8445782c..baf63e2fddb64 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -11,6 +11,7 @@ #include "journal_reclaim.h" #include <linux/prefetch.h> +#include <linux/sort.h> static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); @@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke #endif } +static int wb_key_seq_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + /* Compare excluding idx, the low 24 bits: */ static inline bool wb_key_eq(const void *_l, const void *_r) { @@ -357,6 +366,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); + sort(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 96edf2c34d433..941401a210f56 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) { + BUG(); ret = -EIO; goto err; } @@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans, bch2_data_type_str(ptr_data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + BUG(); ret = -EIO; goto err; } @@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans, static int bch2_trigger_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - s64 *sectors, - unsigned flags) + const union bch_extent_entry *entry, + s64 *sectors, unsigned flags) { bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct bpos bucket; struct bch_backpointer bp; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp); *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); if (flags & BTREE_TRIGGER_TRANSACTIONAL) { @@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_GC) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); percpu_down_read(&c->mark_lock); struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); @@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors; - ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags); + ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 38defa19d52d7..cbfa6459bdbce 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -7,7 +7,7 @@ #include "chardev.h" #include "journal.h" #include "move.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "super.h" #include "super-io.h" diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 4150feca42a2e..b564404dad711 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -14,6 +14,7 @@ #include "move.h" #include "nocow_locking.h" #include "rebalance.h" +#include "snapshot.h" #include "subvolume.h" #include "trace.h" @@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned ptrs_locked = 0; int ret = 0; + /* + * fs is corrupt we have a key for a snapshot node that doesn't exist, + * and we have to check for this because we go rw before repairing the + * snapshots table - just skip it, we can move it later. + */ + if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + return -BCH_ERR_data_update_done; + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index af25d8ec60f22..01a79fa3eacb2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -252,7 +252,8 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) + x(0, need_inode_lock) \ + x(0, invalid_snapshot_node) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 043431206799d..82a6656c941c5 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" -#include "recovery.h" +#include "journal.h" +#include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" @@ -16,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only"); + bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index ae1d6674c512d..36caedf72d89a 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *); int bch2_topology_error(struct bch_fs *); +#define bch2_fs_topology_error(c, ...) \ +({ \ + bch_err(c, "btree topology error: " __VA_ARGS__); \ + bch2_topology_error(c); \ +}) + #define bch2_fs_inconsistent(c, ...) \ ({ \ bch_err(c, __VA_ARGS__); \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 61395b113df9b..b2432d88cda64 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, enum bkey_invalid_flags flags, struct printbuf *err) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, - btree_ptr_v2_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, + c, err, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), + c, err, btree_ptr_v2_min_key_bad, + "min_key > key"); + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); fsck_err: return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index fd2669cdd76f3..3fd0169b98c18 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -596,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) return ret; } -static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) -{ - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - return BCH_DATA_btree; - case KEY_TYPE_extent: - case KEY_TYPE_reflink_v: - return BCH_DATA_user; - case KEY_TYPE_stripe: { - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - - BUG_ON(ptr < s.v->ptrs || - ptr >= s.v->ptrs + s.v->nr_blocks); - - return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant - ? BCH_DATA_parity - : BCH_DATA_user; - } - default: - BUG(); - } -} - unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 33cb6da3a5ad2..f49e6c0f0f683 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -536,7 +536,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) if (likely(!dio->iter.count) || dio->op.error) break; - bio_reset(bio, NULL, REQ_OP_WRITE); + bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); } out: return bch2_dio_write_done(dio); @@ -618,7 +618,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, GFP_KERNEL, &c->dio_write_bioset); dio = container_of(bio, struct dio_write, op.wbio.bio); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0ccee05f6887b..b5ea9fa1259d1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1997,6 +1997,7 @@ out: return dget(sb->s_root); err_put_super: + __bch2_fs_stop(c); deactivate_locked_super(sb); return ERR_PTR(bch2_err_class(ret)); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 47d4eefaba7ba..cbb8b43e419fe 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -12,7 +12,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "snapshot.h" #include "super.h" #include "xattr.h" @@ -158,9 +158,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); - ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); err: bch_err_fn(c, ret); @@ -1371,10 +1372,6 @@ static int check_overlapping_extents(struct btree_trans *trans, goto err; } - ret = extent_ends_at(c, extent_ends, seen, k); - if (ret) - goto err; - extent_ends->last_pos = k.k->p; err: return ret; @@ -1504,6 +1501,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, i->seen_this_pos = true; } + + if (k.k->type != KEY_TYPE_whiteout) { + ret = extent_ends_at(c, extent_ends, s, k); + if (ret) + goto err; + } out: err: fsck_err: @@ -2098,17 +2101,21 @@ static int check_root_trans(struct btree_trans *trans) if (mustfix_fsck_err_on(ret, c, root_subvol_missing, "root subvol missing")) { - struct bkey_i_subvolume root_subvol; + struct bkey_i_subvolume *root_subvol = + bch2_trans_kmalloc(trans, sizeof(*root_subvol)); + ret = PTR_ERR_OR_ZERO(root_subvol); + if (ret) + goto err; snapshot = U32_MAX; inum = BCACHEFS_ROOT_INO; - bkey_subvolume_init(&root_subvol.k_i); - root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_subvol.v.flags = 0; - root_subvol.v.snapshot = cpu_to_le32(snapshot); - root_subvol.v.inode = cpu_to_le64(inum); - ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); + bkey_subvolume_init(&root_subvol->k_i); + root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol->v.flags = 0; + root_subvol->v.snapshot = cpu_to_le32(snapshot); + root_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2b5e06770ab39..ca4a066e9a542 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -552,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out, prt_printf(out, "bi_sectors=%llu", inode->bi_sectors); prt_newline(out); - prt_newline(out); prt_printf(out, "bi_version=%llu", inode->bi_version); + prt_newline(out); #define x(_name, _bits) \ prt_printf(out, #_name "=%llu", (u64) inode->_name); \ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 1baf78594ccaf..82f9170dab3fd 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, ret = 0; err: bch2_logged_op_finish(trans, op_k); + bch_err_fn(c, ret); return ret; } @@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish: break; } err: + bch_err_fn(c, ret); bch2_logged_op_finish(trans, op_k); bch2_trans_iter_exit(trans, &iter); return ret; diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 9fac838d123e8..b82f8209041ff 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); struct bkey_buf sk; u32 restart_count = trans->restart_count; - int ret; if (!fn) return 0; @@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, bch2_bkey_buf_init(&sk); bch2_bkey_buf_reassemble(&sk, c, k); - ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: - fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + fn->resume(trans, sk.k); bch2_bkey_buf_exit(&sk, c); - return ret; + + return trans_was_restarted(trans, restart_count); } int bch2_resume_logged_ops(struct bch_fs *c) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 08ea0cfc4aef0..e1800c4119b5f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,6 +7,7 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -205,6 +206,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = ARRAY_SIZE(_choices), \ .choices = _choices +#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = U64_MAX, \ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 136083c11f3a3..084247c13bd8f 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -362,7 +362,12 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't replay the journal") \ + NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_pass_last, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR_NOLIMIT(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Exit recovery after specified pass") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 03f9d6afe4678..e8b4340092936 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1,35 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "backpointers.h" -#include "bkey_buf.h" #include "alloc_background.h" -#include "btree_gc.h" +#include "bkey_buf.h" #include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" #include "buckets.h" #include "dirent.h" -#include "ec.h" #include "errcode.h" #include "error.h" #include "fs-common.h" -#include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" -#include "lru.h" #include "logged_ops.h" #include "move.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-downgrade.h" #include "snapshot.h" -#include "subvolume.h" #include "super-io.h" #include <linux/sort.h> @@ -186,7 +181,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c) +int bch2_journal_replay(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; DARRAY(struct journal_key *) keys_sorted = { 0 }; @@ -194,6 +189,7 @@ static int bch2_journal_replay(struct bch_fs *c) u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; struct btree_trans *trans = bch2_trans_get(c); + bool immediate_flush = false; int ret = 0; if (keys->nr) { @@ -215,6 +211,13 @@ static int bch2_journal_replay(struct bch_fs *c) darray_for_each(*keys, k) { cond_resched(); + /* + * k->allocated means the key wasn't read in from the journal, + * rather it was from early repair code + */ + if (k->allocated) + immediate_flush = true; + /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, @@ -266,7 +269,8 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal) + if (!c->opts.keep_journal && + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -274,6 +278,12 @@ static int bch2_journal_replay(struct bch_fs *c) bch2_journal_set_replay_done(j); + /* if we did any repair, flush it immediately */ + if (immediate_flush) { + bch2_journal_flush_all_pins(&c->journal); + ret = bch2_journal_meta(&c->journal); + } + if (keys->nr) bch2_journal_log_msg(c, "journal replay finished"); err: @@ -471,150 +481,6 @@ fsck_err: return ret; } -static int bch2_initialize_subvolumes(struct bch_fs *c) -{ - struct bkey_i_snapshot_tree root_tree; - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; - int ret; - - bkey_snapshot_tree_init(&root_tree.k_i); - root_tree.k.p.offset = 1; - root_tree.v.master_subvol = cpu_to_le32(1); - root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); - - bkey_snapshot_init(&root_snapshot.k_i); - root_snapshot.k.p.offset = U32_MAX; - root_snapshot.v.flags = 0; - root_snapshot.v.parent = 0; - root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); - root_snapshot.v.tree = cpu_to_le32(1); - SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - - bkey_subvolume_init(&root_volume.k_i); - root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; - root_volume.v.flags = 0; - root_volume.v.snapshot = cpu_to_le32(U32_MAX); - root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - - ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: - bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); - bch_err_fn(c, ret); - return ret; -} - -static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -{ - struct btree_iter iter; - struct bkey_s_c k; - struct bch_inode_unpacked inode; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - ret = bkey_err(k); - if (ret) - return ret; - - if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; - goto err; - } - - ret = bch2_inode_unpack(k, &inode); - BUG_ON(ret); - - inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; - - ret = bch2_inode_write(trans, &iter, &inode); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* set bi_subvol on root inode */ -noinline_for_stack -static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -{ - int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, - __bch2_fs_upgrade_for_subvolumes(trans)); - bch_err_fn(c, ret); - return ret; -} - -const char * const bch2_recovery_passes[] = { -#define x(_fn, ...) #_fn, - BCH_RECOVERY_PASSES() -#undef x - NULL -}; - -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - - /* - * After we go RW, the journal keys buffer can't be modified (except for - * setting journal_key->overwritten: it will be accessed by multiple - * threads - */ - move_gap(keys, keys->nr); - - set_bit(BCH_FS_may_go_rw, &c->flags); - - if (keys->nr || c->opts.fsck || !c->sb.clean) - return bch2_fs_read_write_early(c); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - unsigned when; -}; - -static struct recovery_pass_fn recovery_pass_fns[] = { -#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - -u64 bch2_recovery_passes_from_stable(u64 v) -{ - static const u8 map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; - - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; -} - static bool check_version_upgrade(struct bch_fs *c) { unsigned latest_version = bcachefs_metadata_version_current; @@ -687,96 +553,6 @@ static bool check_version_upgrade(struct bch_fs *c) return false; } -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); - return ret; -} - -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) - return false; - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - -static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; - - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); - - return 0; -} - -static int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); - } - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); - } - - return ret; -} - -int bch2_run_online_recovery_passes(struct bch_fs *c) -{ - int ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; - - if (!(p->when & PASS_ONLINE)) - continue; - - ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; - } - if (ret) - break; - } - - return ret; -} - int bch2_fs_recovery(struct bch_fs *c) { struct bch_sb_field_clean *clean = NULL; @@ -809,24 +585,14 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.fsck && c->opts.norecovery) { - bch_err(c, "cannot select both norecovery and fsck"); - ret = -EINVAL; - goto err; - } + if (c->opts.norecovery) + c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; if (!c->opts.nochanges) { mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); - if (!ext) { - ret = -BCH_ERR_ENOSPC_sb; - mutex_unlock(&c->sb_lock); - goto err; - } - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) { ext->recovery_passes_required[0] |= cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); @@ -1017,6 +783,12 @@ use_clean: clear_bit(BCH_FS_fsck_running, &c->flags); + /* fsync if we fixed errors */ + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); + } + /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && test_bit(BCH_FS_errors_fixed, &c->flags) && @@ -1051,6 +823,7 @@ use_clean: } mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); bool write_sb = false; if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { @@ -1064,15 +837,12 @@ use_clean: write_sb = true; } - if (!test_bit(BCH_FS_error, &c->flags)) { - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (ext && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); - memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); - write_sb = true; - } + if (!test_bit(BCH_FS_error, &c->flags) && + (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { + memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + write_sb = true; } if (c->opts.fsck && @@ -1155,7 +925,7 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) @@ -1230,7 +1000,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; + c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4e9d24719b2e8..3962fd87b50d8 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,37 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -extern const char * const bch2_recovery_passes[]; - -u64 bch2_recovery_passes_to_stable(u64 v); -u64 bch2_recovery_passes_from_stable(u64 v); - -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - if (c->recovery_passes_explicit & BIT_ULL(pass)) - return 0; - - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - -int bch2_run_online_recovery_passes(struct bch_fs *); -u64 bch2_fsck_recovery_passes(void); +int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c new file mode 100644 index 0000000000000..0089d9456b100 --- /dev/null +++ b/fs/bcachefs/recovery_passes.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_gc.h" +#include "ec.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "lru.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "recovery.h" +#include "recovery_passes.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" + +const char * const bch2_recovery_passes[] = { +#define x(_fn, ...) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, false); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + /* + * After we go RW, the journal keys buffer can't be modified (except for + * setting journal_key->overwritten: it will be accessed by multiple + * threads + */ + move_gap(keys, keys->nr); + + set_bit(BCH_FS_may_go_rw, &c->flags); + + if (keys->nr || c->opts.fsck || !c->sb.clean) + return bch2_fs_read_write_early(c); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + static const u8 map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + }; + + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(map[i]); + return ret; +} + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return 0; + + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!(ext->recovery_passes_required[0] & s)) { + ext->recovery_passes_required[0] |= s; + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + return bch2_run_explicit_recovery_pass(c, pass); +} + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + pass; + int ret; + + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); + + return 0; +} + +int bch2_run_online_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; + continue; + } + if (ret) + break; + } + + return ret; +} + +int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (c->opts.recovery_pass_last && + c->curr_recovery_pass > c->opts.recovery_pass_last) + break; + + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + unsigned pass = c->curr_recovery_pass; + + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || + (ret && c->curr_recovery_pass < pass)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + + c->curr_recovery_pass++; + } + + return ret; +} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h new file mode 100644 index 0000000000000..99b464e127b80 --- /dev/null +++ b/fs/bcachefs/recovery_passes.h @@ -0,0 +1,17 @@ +#ifndef _BCACHEFS_RECOVERY_PASSES_H +#define _BCACHEFS_RECOVERY_PASSES_H + +extern const char * const bch2_recovery_passes[]; + +u64 bch2_recovery_passes_to_stable(u64 v); +u64 bch2_recovery_passes_from_stable(u64 v); + +u64 bch2_fsck_recovery_passes(void); + +int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); + +int bch2_run_online_recovery_passes(struct bch_fs *); +int bch2_run_recovery_passes(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h index 4959e95e7c746..f305212857068 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_RECOVERY_TYPES_H -#define _BCACHEFS_RECOVERY_TYPES_H +#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H +#define _BCACHEFS_RECOVERY_PASSES_TYPES_H #define PASS_SILENT BIT(0) #define PASS_FSCK BIT(1) @@ -37,7 +37,6 @@ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ x(check_indirect_extents, 26, PASS_FSCK) \ @@ -47,6 +46,7 @@ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(fix_reflink_p, 33, 0) \ x(set_fs_needs_rebalance, 34, 0) \ @@ -56,6 +56,7 @@ enum bch_recovery_pass { #define x(n, id, when) BCH_RECOVERY_PASS_##n, BCH_RECOVERY_PASSES() #undef x + BCH_RECOVERY_PASS_NR }; /* But we also need stable identifiers that can be used in the superblock */ @@ -65,4 +66,4 @@ enum bch_recovery_pass_stable { #undef x }; -#endif /* _BCACHEFS_RECOVERY_TYPES_H */ +#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index c47c66c2b394d..ff7864731a073 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -185,8 +185,7 @@ not_found: } else { bkey_error_init(update); update->k.p = p.k->p; - update->k.p.offset = next_idx; - update->k.size = next_idx - *idx; + update->k.size = p.k->size; set_bkey_val_u64s(&update->k, 0); } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index e4396cb0bacb0..d6f81179c3a29 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -7,7 +7,7 @@ #include "bcachefs.h" #include "darray.h" -#include "recovery.h" +#include "recovery_passes.h" #include "sb-downgrade.h" #include "sb-errors.h" #include "super-io.h" diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 5178bf579f7c5..73e9634df8ffb 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -265,7 +265,9 @@ x(subvol_children_bad, 257) \ x(subvol_loop, 258) \ x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 39debe814bf39..4577ee7939a2a 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -93,8 +93,10 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) { - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; + while (id && id < ancestor) { + const struct snapshot_t *s = __snapshot_t(t, id); + id = s ? s->parent : 0; + } return id == ancestor; } @@ -110,6 +112,8 @@ static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancest static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; if (s->skip[2] <= ancestor) return s->skip[2]; @@ -127,7 +131,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } @@ -151,36 +155,39 @@ out: static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; - size_t new_size; struct snapshot_table *new, *old; - new_size = max(16UL, roundup_pow_of_two(idx + 1)); + size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); + size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); - new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + new = kvzalloc(new_bytes, GFP_KERNEL); if (!new) return NULL; + new->nr = new_size; + old = rcu_dereference_protected(c->snapshots, true); if (old) - memcpy(new->s, - rcu_dereference_protected(c->snapshots, true)->s, - sizeof(new->s[0]) * c->snapshot_table_size); + memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); rcu_assign_pointer(c->snapshots, new); - c->snapshot_table_size = new_size; - kvfree_rcu_mightsleep(old); + kvfree_rcu(old, rcu); - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + return &rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock))->s[idx]; } static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) { size_t idx = U32_MAX - id; + struct snapshot_table *table = + rcu_dereference_protected(c->snapshots, + lockdep_is_held(&c->snapshot_table_lock)); lockdep_assert_held(&c->snapshot_table_lock); - if (likely(idx < c->snapshot_table_size)) - return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + if (likely(table && idx < table->nr)) + return &table->s[idx]; return __snapshot_t_mut(c, id); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 7c66ffc06385d..331f20fd8d03d 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) { - return &t->s[U32_MAX - id]; + u32 idx = U32_MAX - id; + + return likely(t && idx < t->nr) + ? &t->s[idx] + : NULL; } static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) @@ -44,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = snapshot_t(c, id)->tree; + const struct snapshot_t *s = snapshot_t(c, id); + id = s ? s->tree : 0; rcu_read_unlock(); return id; @@ -52,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->parent : 0; } static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -66,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) { -#ifdef CONFIG_BCACHEFS_DEBUG - u32 parent = snapshot_t(c, id)->parent; + const struct snapshot_t *s = snapshot_t(c, id); + if (!s) + return 0; - if (parent && - snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + u32 parent = s->parent; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) && + parent && + s->depth != snapshot_t(c, parent)->depth + 1) panic("id %u depth=%u parent %u depth=%u\n", id, snapshot_t(c, id)->depth, parent, snapshot_t(c, parent)->depth); return parent; -#else - return snapshot_t(c, id)->parent; -#endif } static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -116,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { - return snapshot_t(c, id)->equiv; + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->equiv : 0; } static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) @@ -133,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) return id == bch2_snapshot_equiv(c, id); } -static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - bool ret; - rcu_read_lock(); - s = snapshot_t(c, id); - ret = s->children[0]; + const struct snapshot_t *s = snapshot_t(c, id); + int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; rcu_read_unlock(); return ret; } -static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -{ - return !bch2_snapshot_is_internal_node(c, id); -} - -static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - u32 parent = __bch2_snapshot_parent(c, id); - - if (!parent) - return 0; - - s = snapshot_t(c, __bch2_snapshot_parent(c, id)); - if (id == s->children[0]) - return s->children[1]; - if (id == s->children[1]) - return s->children[0]; - return 0; + int ret = bch2_snapshot_is_internal_node(c, id); + if (ret < 0) + return ret; + return !ret; } static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) @@ -249,7 +240,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0) return 0; return __bch2_key_has_snapshot_overwrites(trans, id, pos); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index ce7aed1219423..88a79c8232768 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -595,6 +595,78 @@ err: return ret; } +int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + __bch2_fs_upgrade_for_subvolumes(trans)); + bch_err_fn(c, ret); + return ret; +} + int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 903c05162c068..d2015d549bd2a 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -37,6 +37,9 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); +int bch2_initialize_subvolumes(struct bch_fs *); +int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); + int bch2_fs_subvolumes_init(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index ae644adfc3916..9b10c8947828e 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,6 +20,8 @@ struct snapshot_t { }; struct snapshot_table { + struct rcu_head rcu; + size_t nr; #ifndef RUST_BINDGEN DECLARE_FLEX_ARRAY(struct snapshot_t, s); #else diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ad28e370b6404..e15f8b1f30c2a 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -8,7 +8,7 @@ #include "journal.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" -#include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "quota.h" #include "sb-clean.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 1ad6e5cd9476c..bc026a77eb99d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -365,7 +365,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - !c->opts.norecovery) { + c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); @@ -510,7 +510,8 @@ err: int bch2_fs_read_write(struct bch_fs *c) { - if (c->opts.norecovery) + if (c->opts.recovery_pass_last && + c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) @@ -1015,8 +1016,16 @@ int bch2_fs_start(struct bch_fs *c) for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + struct bch_sb_field_ext *ext = + bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); + if (!ext) { + bch_err(c, "insufficient space in superblock for sb_field_ext"); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } + for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1920ed69279b5..3314249e86747 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1359,7 +1359,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5f7587ca1ca77..1e09aeea69c22 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1559,7 +1559,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used) { + if (space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7441245b1ceb1..61594eaf1f896 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4333,6 +4333,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) goto done; + /* + * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above + * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have + * started and finished reading the same eb. In this case, UPTODATE + * will now be set, and we shouldn't read it in again. + */ + if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + return 0; + } + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 347ca13d15a97..445f7716f1e2f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -309,7 +309,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - start, len, gen); + start, start + len, gen); ret = -ENOENT; goto out; } @@ -318,7 +318,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - em->start, start, len, gen); + em->start, start, start + len, gen); ret = -EUCLEAN; goto out; } @@ -340,9 +340,9 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) em->mod_len = em->len; } - free_extent_map(em); out: write_unlock(&tree->lock); + free_extent_map(em); return ret; } @@ -629,13 +629,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, */ ret = merge_extent_mapping(em_tree, existing, em, start); - if (ret) { + if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; - WARN_ONCE(ret, -"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", - existing->start, existing->len, - orig_start, orig_len, start); + btrfs_warn(fs_info, +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c4bd0e60db592..fa25004ab04e7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2812,7 +2812,17 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); + if (ret == -ENOENT) + break; + + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + continue; + } + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->commit_total_bytes) break; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1dc1f1946ae0e..f15591f3e54fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -692,6 +692,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->bdev = file_bdev(bdev_file); clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (device->devt != device->bdev->bd_dev) { + btrfs_warn(NULL, + "device %s maj:min changed from %d:%d to %d:%d", + device->name->str, MAJOR(device->devt), + MINOR(device->devt), MAJOR(device->bdev->bd_dev), + MINOR(device->bdev->bd_dev)); + + device->devt = device->bdev->bd_dev; + } + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1174,23 +1184,30 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int ret; + int ret2; - ret = btrfs_open_one_device(fs_devices, device, flags, holder); - if (ret == 0 && + ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; - } else if (ret == -ENODATA) { + } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } + if (ret == 0 && ret2 != 0) + ret = ret2; } - if (fs_devices->open_devices == 0) + + if (fs_devices->open_devices == 0) { + if (ret) + return ret; return -EINVAL; + } fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5a3d5ec75c5a9..4cba80b34387c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1574,11 +1574,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (!map) return -EINVAL; - cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); - if (!cache->physical_map) { - ret = -ENOMEM; - goto out; - } + cache->physical_map = map; zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { @@ -1690,7 +1686,6 @@ out: } bitmap_free(active); kfree(zone_info); - btrfs_free_chunk_map(map); return ret; } @@ -2175,6 +2170,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i; @@ -2250,6 +2246,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2266,13 +2263,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ zinfo->zone_size >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); - if (ret) + if (ret) { + up_read(&dev_replace->rwsem); return ret; + } if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } + up_read(&dev_replace->rwsem); if (!fully_written) btrfs_dec_block_group_ro(block_group); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 69308fd73e4a9..c0eb139adb07a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -430,7 +430,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: - warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); set_opt(&ctx->opt, DAX_ALWAYS); clear_opt(&ctx->opt, DAX_NEVER); return true; diff --git a/fs/exec.c b/fs/exec.c index ff6f26671cfc0..cf1df7f16e55c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -895,6 +895,7 @@ int transfer_args_to_stack(struct linux_binprm *bprm, goto out; } + bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 789af5c8fade9..aa1626955b2cf 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1718,7 +1718,8 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; - u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift; + unsigned int bsize = 1 << bsize_shift; + u64 lblock = (offset + bsize - 1) >> bsize_shift; __u16 start_list[GFS2_MAX_META_HEIGHT]; __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; unsigned int start_aligned, end_aligned; @@ -1729,7 +1730,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) u64 prev_bnr = 0; __be64 *start, *end; - if (offset >= maxsize) { + if (offset + bsize - 1 >= maxsize) { /* * The starting point lies beyond the allocated metadata; * there are no blocks to deallocate. diff --git a/fs/namei.c b/fs/namei.c index ceb9ddf8dfdd4..c5b2a25be7d04 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4050,6 +4050,8 @@ retry: case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); + if (!error) + security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, @@ -4060,11 +4062,6 @@ retry: dentry, mode, 0); break; } - - if (error) - goto out2; - - security_path_post_mknod(idmap, dentry); out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1a93c7fcf76c5..2391ab3c32319 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3831,15 +3831,20 @@ nfsd4_create_session(struct svc_rqst *rqstp, else cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - if (status == nfserr_replay_cache) { - status = nfsd4_replay_create_session(cr_ses, cs_slot); - goto out_free_conn; - } + switch (status) { + case nfs_ok: + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + break; + case nfserr_replay_cache: + status = nfsd4_replay_create_session(cr_ses, cs_slot); + fallthrough; + case nfserr_jukebox: + /* The server MUST NOT cache NFS4ERR_DELAY */ + goto out_free_conn; + default: goto out_cache_error; } - cs_slot->sl_seqid++; - cr_ses->seqid = cs_slot->sl_seqid; /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ if (conf) { @@ -3859,10 +3864,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); - if (status) { - old = NULL; - goto out_cache_error; - } + if (status) + goto out_expired_error; trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); @@ -3894,6 +3897,17 @@ nfsd4_create_session(struct svc_rqst *rqstp, expire_client(old); return status; +out_expired_error: + old = NULL; + /* + * Revert the slot seq_nr change so the server will process + * the client's resend instead of returning a cached response. + */ + if (status == nfserr_jukebox) { + cs_slot->sl_seqid--; + cr_ses->seqid = cs_slot->sl_seqid; + goto out_free_conn; + } out_cache_error: nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6a9464262fae6..2e41eb4c3cec7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1852,7 +1852,7 @@ retry: trap = lock_rename(tdentry, fdentry); if (IS_ERR(trap)) { err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; - goto out; + goto out_want_write; } err = fh_fill_pre_attrs(ffhp); if (err != nfs_ok) @@ -1922,6 +1922,7 @@ retry: } out_unlock: unlock_rename(tdentry, fdentry); +out_want_write: fh_drop_write(ffhp); /* diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8bad..7b4db9c56e6a7 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -5,7 +5,7 @@ obj-y += proc.o -CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,) +CFLAGS_task_mmu.o += -Wno-override-init proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := task_mmu.o diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 89333d9bce36e..d11dc3aa458ba 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -612,11 +612,18 @@ int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode, goto mknod_out; } + trace_smb3_mknod_enter(xid, tcon->ses->Suid, tcon->tid, full_path); + rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon, full_path, mode, device_number); mknod_out: + if (rc) + trace_smb3_mknod_err(xid, tcon->ses->Suid, tcon->tid, rc); + else + trace_smb3_mknod_done(xid, tcon->ses->Suid, tcon->tid); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index c4a3cb736881a..340efce8f0529 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -12,6 +12,16 @@ #include "cifs_fs_sb.h" #include "cifsproto.h" +/* + * Key for fscache inode. [!] Contents must match comparisons in cifs_find_inode(). + */ +struct cifs_fscache_inode_key { + + __le64 uniqueid; /* server inode number */ + __le64 createtime; /* creation time on server */ + u8 type; /* S_IFMT file type */ +} __packed; + static void cifs_fscache_fill_volume_coherency( struct cifs_tcon *tcon, struct cifs_fscache_volume_coherency_data *cd) @@ -97,15 +107,19 @@ void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) void cifs_fscache_get_inode_cookie(struct inode *inode) { struct cifs_fscache_inode_coherency_data cd; + struct cifs_fscache_inode_key key; struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + key.uniqueid = cpu_to_le64(cifsi->uniqueid); + key.createtime = cpu_to_le64(cifsi->createtime); + key.type = (inode->i_mode & S_IFMT) >> 12; cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd); cifsi->netfs.cache = fscache_acquire_cookie(tcon->fscache, 0, - &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &key, sizeof(key), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); if (cifsi->netfs.cache) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index d28ab0af60493..91b07ef9e25ca 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1351,6 +1351,8 @@ cifs_find_inode(struct inode *inode, void *opaque) { struct cifs_fattr *fattr = opaque; + /* [!] The compared values must be the same in struct cifs_fscache_inode_key. */ + /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index f9c1fd32d0b8c..5e83cb9da9028 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -375,6 +375,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter); DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, TP_PROTO(unsigned int xid, @@ -415,7 +416,7 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); - +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done); DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, TP_PROTO(unsigned int xid, @@ -461,6 +462,7 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err); /* * For logging SMB3 Status code and Command for responses which return errors diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d991eec054368..73a4b895de670 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -530,7 +530,8 @@ xfs_validate_sb_common( } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), - XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) + XFS_FSB_TO_B(mp, sbp->sb_width), 0, + xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) return -EFSCORRUPTED; /* @@ -1323,8 +1324,10 @@ xfs_sb_get_secondary( } /* - * sunit, swidth, sectorsize(optional with 0) should be all in bytes, - * so users won't be confused by values in error messages. + * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users + * won't be confused by values in error messages. This function returns false + * if the stripe geometry is invalid and the caller is unable to repair the + * stripe configuration later in the mount process. */ bool xfs_validate_stripe_geometry( @@ -1332,20 +1335,21 @@ xfs_validate_stripe_geometry( __s64 sunit, __s64 swidth, int sectorsize, + bool may_repair, bool silent) { if (swidth > INT_MAX) { if (!silent) xfs_notice(mp, "stripe width (%lld) is too large", swidth); - return false; + goto check_override; } if (sunit > swidth) { if (!silent) xfs_notice(mp, "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); - return false; + goto check_override; } if (sectorsize && (int)sunit % sectorsize) { @@ -1353,21 +1357,21 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe unit (%lld) must be a multiple of the sector size (%d)", sunit, sectorsize); - return false; + goto check_override; } if (sunit && !swidth) { if (!silent) xfs_notice(mp, "invalid stripe unit (%lld) and stripe width of 0", sunit); - return false; + goto check_override; } if (!sunit && swidth) { if (!silent) xfs_notice(mp, "invalid stripe width (%lld) and stripe unit of 0", swidth); - return false; + goto check_override; } if (sunit && (int)swidth % (int)sunit) { @@ -1375,9 +1379,27 @@ xfs_validate_stripe_geometry( xfs_notice(mp, "stripe width (%lld) must be a multiple of the stripe unit (%lld)", swidth, sunit); - return false; + goto check_override; } return true; + +check_override: + if (!may_repair) + return false; + /* + * During mount, mp->m_dalign will not be set unless the sunit mount + * option was set. If it was set, ignore the bad stripe alignment values + * and allow the validation and overwrite later in the mount process to + * attempt to overwrite the bad stripe alignment values with the values + * supplied by mount options. + */ + if (!mp->m_dalign) + return false; + if (!silent) + xfs_notice(mp, +"Will try to correct with specified mount options sunit (%d) and swidth (%d)", + BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); + return true; } /* diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 2e8e8d63d4eb2..37b1ed1bc2095 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -35,8 +35,9 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); -extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, - __s64 sunit, __s64 swidth, int sectorsize, bool silent); +bool xfs_validate_stripe_geometry(struct xfs_mount *mp, + __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, + bool silent); uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index abff79a77c72b..47a20cf5205f0 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1044,9 +1044,7 @@ xchk_irele( struct xfs_scrub *sc, struct xfs_inode *ip) { - if (current->journal_info != NULL) { - ASSERT(current->journal_info == sc->tp); - + if (sc->tp) { /* * If we are in a transaction, we /cannot/ drop the inode * ourselves, because the VFS will trigger writeback, which diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1698507d1ac73..3f428620ebf2a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -503,13 +503,6 @@ xfs_vm_writepages( { struct xfs_writepage_ctx wpc = { }; - /* - * Writing back data in a transaction context can result in recursive - * transactions. This is bad, so issue a warning and get out of here. - */ - if (WARN_ON_ONCE(current->journal_info)) - return 0; - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e64265bc0b337..74f1812b03cbd 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2039,8 +2039,10 @@ xfs_inodegc_want_queue_work( * - Memory shrinkers queued the inactivation worker and it hasn't finished. * - The queue depth exceeds the maximum allowable percpu backlog. * - * Note: If the current thread is running a transaction, we don't ever want to - * wait for other transactions because that could introduce a deadlock. + * Note: If we are in a NOFS context here (e.g. current thread is running a + * transaction) the we don't want to block here as inodegc progress may require + * filesystem resources we hold to make progress and that could result in a + * deadlock. Hence we skip out of here if we are in a scoped NOFS context. */ static inline bool xfs_inodegc_want_flush_work( @@ -2048,7 +2050,7 @@ xfs_inodegc_want_flush_work( unsigned int items, unsigned int shrinker_hits) { - if (current->journal_info) + if (current->flags & PF_MEMALLOC_NOFS) return false; if (shrinker_hits > 0) diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 3f7e3a09a49ff..1636663707dc0 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -268,19 +268,14 @@ static inline void xfs_trans_set_context( struct xfs_trans *tp) { - ASSERT(current->journal_info == NULL); tp->t_pflags = memalloc_nofs_save(); - current->journal_info = tp; } static inline void xfs_trans_clear_context( struct xfs_trans *tp) { - if (current->journal_info == tp) { - memalloc_nofs_restore(tp->t_pflags); - current->journal_info = NULL; - } + memalloc_nofs_restore(tp->t_pflags); } static inline void @@ -288,10 +283,8 @@ xfs_trans_switch_context( struct xfs_trans *old_tp, struct xfs_trans *new_tp) { - ASSERT(current->journal_info == old_tp); new_tp->t_pflags = old_tp->t_pflags; old_tp->t_pflags = 0; - current->journal_info = new_tp; } #endif /* __XFS_TRANS_H__ */ |