aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-04-11 11:24:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-04-11 11:24:55 -0700
commite1dc191dbf3f35cf07790b52110267bef55515a2 (patch)
tree7a6dbd43e37f53e03f4a1fa0b5d8bd85c921a1ef
parent346668f02a770f84bfcbac0c07e70569349eba08 (diff)
parent1189bdda6c991cbf9342d84410042dd5f3a792e0 (diff)
downloadlinux-e1dc191dbf3f35cf07790b52110267bef55515a2.tar.gz
Merge tag 'bcachefs-2024-04-10' of https://evilpiepirate.org/git/bcachefs
Pull more bcachefs fixes from Kent Overstreet: "Notable user impacting bugs - On multi device filesystems, recovery was looping in btree_trans_too_many_iters(). This checks if a transaction has touched too many btree paths (because of iteration over many keys), and isuses a restart to drop unneeded paths. But it's now possible for some paths to exceed the previous limit without iteration in the interior btree update path, since the transaction commit will do alloc updates for every old and new btree node, and during journal replay we don't use the btree write buffer for locking reasons and thus those updates use btree paths when they wouldn't normally. - Fix a corner case in rebalance when moving extents on a durability=0 device. This wouldn't be hit when a device was formatted with durability=0 since in that case we'll only use it as a write through cache (only cached extents will live on it), but durability can now be changed on an existing device. - bch2_get_acl() could rarely forget to handle a transaction restart; this manifested as the occasional missing acl that came back after dropping caches. - Fix a major performance regression on high iops multithreaded write workloads (only since 6.9-rc1); a previous fix for a deadlock in the interior btree update path to check the journal watermark introduced a dependency on the state of btree write buffer flushing that we didn't want. - Assorted other repair paths and recovery fixes" * tag 'bcachefs-2024-04-10' of https://evilpiepirate.org/git/bcachefs: (25 commits) bcachefs: Fix __bch2_btree_and_journal_iter_init_node_iter() bcachefs: Kill read lock dropping in bch2_btree_node_lock_write_nofail() bcachefs: Fix a race in btree_update_nodes_written() bcachefs: btree_node_scan: Respect member.data_allowed bcachefs: Don't scan for btree nodes when we can reconstruct bcachefs: Fix check_topology() when using node scan bcachefs: fix eytzinger0_find_gt() bcachefs: fix bch2_get_acl() transaction restart handling bcachefs: fix the count of nr_freed_pcpu after changing bc->freed_nonpcpu list bcachefs: Fix gap buffer bug in bch2_journal_key_insert_take() bcachefs: Rename struct field swap to prevent macro naming collision MAINTAINERS: Add entry for bcachefs documentation Documentation: filesystems: Add bcachefs toctree bcachefs: JOURNAL_SPACE_LOW bcachefs: Disable errors=panic for BCH_IOCTL_FSCK_OFFLINE bcachefs: Fix BCH_IOCTL_FSCK_OFFLINE for encrypted filesystems bcachefs: fix rand_delete unit test bcachefs: fix ! vs ~ typo in __clear_bit_le64() bcachefs: Fix rebalance from durability=0 device bcachefs: Print shutdown journal sequence number ...
-rw-r--r--Documentation/filesystems/bcachefs/index.rst11
-rw-r--r--Documentation/filesystems/index.rst1
-rw-r--r--MAINTAINERS1
-rw-r--r--fs/bcachefs/acl.c30
-rw-r--r--fs/bcachefs/bcachefs_format.h14
-rw-r--r--fs/bcachefs/btree_gc.c13
-rw-r--r--fs/bcachefs/btree_iter.h2
-rw-r--r--fs/bcachefs/btree_journal_iter.c67
-rw-r--r--fs/bcachefs/btree_key_cache.c4
-rw-r--r--fs/bcachefs/btree_locking.c28
-rw-r--r--fs/bcachefs/btree_node_scan.c11
-rw-r--r--fs/bcachefs/btree_types.h14
-rw-r--r--fs/bcachefs/btree_update_interior.c128
-rw-r--r--fs/bcachefs/btree_update_interior.h3
-rw-r--r--fs/bcachefs/chardev.c98
-rw-r--r--fs/bcachefs/data_update.c17
-rw-r--r--fs/bcachefs/debug.c75
-rw-r--r--fs/bcachefs/eytzinger.c8
-rw-r--r--fs/bcachefs/eytzinger.h26
-rw-r--r--fs/bcachefs/journal_reclaim.c2
-rw-r--r--fs/bcachefs/journal_types.h1
-rw-r--r--fs/bcachefs/recovery.c14
-rw-r--r--fs/bcachefs/snapshot.c19
-rw-r--r--fs/bcachefs/super.c5
-rw-r--r--fs/bcachefs/sysfs.c6
-rw-r--r--fs/bcachefs/tests.c2
-rw-r--r--fs/bcachefs/util.h10
27 files changed, 372 insertions, 238 deletions
diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst
new file mode 100644
index 00000000000000..e2bd61ccd96fff
--- /dev/null
+++ b/Documentation/filesystems/bcachefs/index.rst
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+bcachefs Documentation
+======================
+
+.. toctree::
+ :maxdepth: 2
+ :numbered:
+
+ errorcodes
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 0ea1e44fa02823..1f9b4c905a6a7c 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -69,6 +69,7 @@ Documentation for filesystem implementations.
afs
autofs
autofs-mount-control
+ bcachefs/index
befs
bfs
btrfs
diff --git a/MAINTAINERS b/MAINTAINERS
index ae726e28e7ecc2..c64203a9f50e40 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3573,6 +3573,7 @@ S: Supported
C: irc://irc.oftc.net/bcache
T: git https://evilpiepirate.org/git/bcachefs.git
F: fs/bcachefs/
+F: Documentation/filesystems/bcachefs/
BDISP ST MEDIA DRIVER
M: Fabien Dessenne <fabien.dessenne@foss.st.com>
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce118..5c180fdc3efbdf 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
@@ -290,28 +289,27 @@ retry:
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
- if (ret) {
- if (!bch2_err_matches(ret, ENOENT))
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
- if (ret) {
- acl = ERR_PTR(ret);
- goto out;
- }
+ if (ret)
+ goto err;
- xattr = bkey_s_c_to_xattr(k);
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
- le16_to_cpu(xattr.v->x_val_len));
+ le16_to_cpu(xattr.v->x_val_len));
+ ret = PTR_ERR_OR_ZERO(acl);
+err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto retry;
- if (!IS_ERR(acl))
+ if (ret)
+ acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
+
+ if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
-out:
- if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
- goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 63102992d9556d..364ae42022af17 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1535,6 +1535,20 @@ enum btree_id {
BTREE_ID_NR
};
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+ switch (id) {
+ case BTREE_ID_alloc:
+ case BTREE_ID_backpointers:
+ case BTREE_ID_need_discard:
+ case BTREE_ID_freespace:
+ case BTREE_ID_bucket_gens:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 6280da1244b550..d2555da55c6da3 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -368,11 +368,16 @@ again:
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
- bch2_journal_key_delete(c, b->c.btree_id,
- b->c.level, cur_k.k->k.p);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
if (ret)
break;
+
+ if (!btree_id_is_alloc(b->c.btree_id)) {
+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ break;
+ }
continue;
}
@@ -544,12 +549,12 @@ reconstruct_root:
bch2_btree_root_alloc_fake(c, i, 0);
} else {
bch2_btree_root_alloc_fake(c, i, 1);
+ bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
break;
}
- bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
reconstructed_root = true;
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4cc74..1d58d447b386cd 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 5cbcbfe85235b8..1e8cf49a693531 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
+static void journal_iter_verify(struct journal_iter *iter)
+{
+ struct journal_keys *keys = iter->keys;
+ size_t gap_size = keys->size - keys->nr;
+
+ BUG_ON(iter->idx >= keys->gap &&
+ iter->idx < keys->gap + gap_size);
+
+ if (iter->idx < keys->size) {
+ struct journal_key *k = keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ BUG_ON(cmp < 0);
+ }
+}
+
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
- struct btree_and_journal_iter *iter;
+ struct journal_key *new_key = &keys->data[keys->gap - 1];
+ struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
@@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
- list_for_each_entry(iter, &c->journal_iters, journal.list)
- if (iter->journal.idx == gap_end)
- iter->journal.idx = keys->gap - 1;
+ list_for_each_entry(iter, &c->journal_iters, list) {
+ journal_iter_verify(iter);
+ if (iter->idx == gap_end &&
+ new_key->btree_id == iter->btree_id &&
+ new_key->level == iter->level)
+ iter->idx = keys->gap - 1;
+ journal_iter_verify(iter);
+ }
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (idx > keys->gap)
idx -= keys->size - keys->nr;
+ size_t old_gap = keys->gap;
+
if (keys->nr == keys->size) {
+ journal_iters_move_gap(c, old_gap, keys->size);
+ old_gap = keys->size;
+
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
@@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
- journal_iters_move_gap(c, keys->gap, idx);
+ journal_iters_move_gap(c, old_gap, idx);
move_gap(keys, idx);
@@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->keys->data + iter->idx;
+ journal_iter_verify(iter);
+
+ while (iter->idx < iter->keys->size) {
+ struct journal_key *k = iter->keys->data + iter->idx;
+
+ int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
+ cmp_int(k->level, iter->level);
+ if (cmp > 0)
+ break;
+ BUG_ON(cmp);
- while (k < iter->keys->data + iter->keys->size &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
- k = iter->keys->data + iter->idx;
}
return bkey_s_c_null;
@@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+ journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -434,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
- bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
- INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
+ INIT_LIST_HEAD(&iter->journal.list);
+
+ if (trans->journal_replay_not_finished) {
+ bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+ list_add(&iter->journal.list, &trans->c->journal_iters);
+ }
}
/*
@@ -452,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
- if (trans->journal_replay_not_finished &&
- !test_bit(BCH_FS_may_go_rw, &trans->c->flags))
- list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 581edcb0911bfa..88a3582a32757e 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
+ bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
+ bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
@@ -659,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
- j->watermark == BCH_WATERMARK_stripe)
+ !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index b9b151e693ed60..f2caf491957efc 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- struct btree_path *linked;
- unsigned i, iter;
- int ret;
-
- /*
- * XXX BIG FAT NOTICE
- *
- * Drop all read locks before taking a write lock:
- *
- * This is a hack, because bch2_btree_node_lock_write_nofail() is a
- * hack - but by dropping read locks first, this should never fail, and
- * we only use this in code paths where whatever read locks we've
- * already taken are no longer needed:
- */
-
- trans_for_each_path(trans, linked, iter) {
- if (!linked->nodes_locked)
- continue;
-
- for (i = 0; i < BTREE_MAX_DEPTH; i++)
- if (btree_node_read_locked(linked, i)) {
- btree_node_unlock(trans, linked, i);
- btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
- }
- }
-
- ret = __btree_node_lock_write(trans, path, b, true);
+ int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 3f33be7e5e5c26..556f76f5c84e16 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -133,6 +133,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
+ if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+ return;
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -213,6 +216,9 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);
for_each_online_member(c, ca) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
struct task_struct *t;
@@ -290,7 +296,7 @@ again:
found_btree_node_to_text(&buf, c, n);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -1;
+ return -BCH_ERR_fsck_repair_unimplemented;
}
}
@@ -436,6 +442,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos node_min, struct bpos node_max)
{
+ if (btree_id_is_alloc(btree))
+ return 0;
+
struct find_btree_nodes *f = &c->found_btree_nodes;
int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 9404d96c38f3b3..e0c982a4195c76 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -364,7 +364,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
+/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
+/*
+ * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iter()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT 256
+/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 32397b99752fd2..c4a5e83a56a436 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -26,9 +26,9 @@
#include <linux/random.h>
-const char * const bch2_btree_update_modes[] = {
+static const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
- BCH_WATERMARKS()
+ BTREE_UPDATE_MODES()
#undef x
NULL
};
@@ -704,9 +704,13 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
err:
- if (as->b) {
-
- b = as->b;
+ /*
+ * We have to be careful because another thread might be getting ready
+ * to free as->b and calling btree_update_reparent() on us - we'll
+ * recheck under btree_update_lock below:
+ */
+ b = READ_ONCE(as->b);
+ if (b) {
btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@@ -850,15 +854,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
BUG_ON(as->mode != BTREE_UPDATE_none);
+ BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
as->mode = BTREE_UPDATE_node;
as->b = b;
+ as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
@@ -1100,7 +1106,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level, bool split, unsigned flags)
+ unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1108,7 +1114,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
- unsigned update_level = level;
+ unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1123,34 +1129,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (watermark < c->journal.watermark) {
- struct journal_res res = { 0 };
- unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
+ if (watermark < BCH_WATERMARK_reclaim &&
+ test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
+ if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+ return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
- if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
- watermark < BCH_WATERMARK_reclaim)
- journal_flags |= JOURNAL_RES_GET_NONBLOCK;
-
- ret = drop_locks_do(trans,
- bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
- if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ bch2_trans_unlock(trans);
+ wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
+ ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
while (1) {
- nr_nodes[!!update_level] += 1 + split;
- update_level++;
+ nr_nodes[!!level_end] += 1 + split;
+ level_end++;
- ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
- if (!btree_path_node(path, update_level)) {
+ if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
- update_level = BTREE_MAX_DEPTH;
+ level_end = BTREE_MAX_DEPTH;
break;
}
@@ -1158,11 +1160,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
- split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+ split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
@@ -1176,14 +1178,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
- as->c = c;
- as->start_time = start_time;
- as->ip_started = _RET_IP_;
- as->mode = BTREE_UPDATE_none;
- as->watermark = watermark;
- as->took_gc_lock = true;
- as->btree_id = path->btree_id;
- as->update_level = update_level;
+ as->c = c;
+ as->start_time = start_time;
+ as->ip_started = _RET_IP_;
+ as->mode = BTREE_UPDATE_none;
+ as->watermark = watermark;
+ as->took_gc_lock = true;
+ as->btree_id = path->btree_id;
+ as->update_level_start = level_start;
+ as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1373,12 +1376,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
}
static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct btree_node_iter node_iter,
- struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+ struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
@@ -1534,7 +1537,7 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
BUG_ON(bch2_btree_node_check_topology(trans, b));
}
@@ -1714,27 +1717,6 @@ err:
goto out;
}
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
- struct btree_trans *trans,
- struct btree_path *path,
- struct btree *b,
- struct keylist *keys)
-{
- struct btree_path *linked;
- unsigned i;
-
- __bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
- btree_update_updated_node(as, b);
-
- trans_for_each_path_with_node(trans, b, linked, i)
- bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
- bch2_trans_verify_paths(trans);
-}
-
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
@@ -1755,7 +1737,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
struct keylist *keys)
{
struct bch_fs *c = as->c;
- struct btree_path *path = trans->paths + path_idx;
+ struct btree_path *path = trans->paths + path_idx, *linked;
+ unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1784,7 +1767,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
return ret;
}
- bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
+
+ trans_for_each_path_with_node(trans, b, linked, i)
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
+
+ bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1798,6 +1787,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
+ btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
BUG_ON(bch2_btree_node_check_topology(trans, b));
@@ -1807,7 +1797,7 @@ split:
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
- if (b->c.level >= as->update_level) {
+ if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
@@ -2519,9 +2509,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
- prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
(void *) as->ip_started,
bch2_btree_id_str(as->btree_id),
+ as->update_level_start,
+ as->update_level_end,
bch2_watermarks[as->watermark],
bch2_btree_update_modes[as->mode],
as->nodes_written,
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 88dcf5a22a3bd6..c1a479ebaad121 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -57,7 +57,8 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
- unsigned update_level;
+ unsigned update_level_start;
+ unsigned update_level_end;
struct disk_reservation disk_res;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index cbfa6459bdbcee..72781aad6ba70c 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
struct fsck_thread {
struct thread_with_stdio thr;
struct bch_fs *c;
- char **devs;
- size_t nr_devs;
struct bch_opts opts;
};
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
- if (thr->devs)
- for (size_t i = 0; i < thr->nr_devs; i++)
- kfree(thr->devs[i]);
- kfree(thr->devs);
kfree(thr);
}
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
- struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
-
- if (IS_ERR(c))
- return PTR_ERR(c);
+ struct bch_fs *c = thr->c;
- int ret = 0;
- if (test_bit(BCH_FS_errors_fixed, &c->flags))
- ret |= 1;
- if (test_bit(BCH_FS_error, &c->flags))
- ret |= 4;
+ int ret = PTR_ERR_OR_ZERO(c);
+ if (ret)
+ return ret;
- bch2_fs_stop(c);
+ ret = bch2_fs_start(thr->c);
+ if (ret)
+ goto err;
- if (ret & 1)
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
- if (ret & 4)
+ ret |= 1;
+ }
+ if (test_bit(BCH_FS_error, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
-
+ ret |= 4;
+ }
+err:
+ bch2_fs_stop(c);
return ret;
}
@@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- u64 *devs = NULL;
+ darray_str(devs) = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
- !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
- !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
- ret = -ENOMEM;
- goto err;
- }
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ u64 dev_u64;
+ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+ if (ret)
+ goto err;
- thr->opts = bch2_opts_empty();
- thr->nr_devs = arg.nr_devs;
+ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(dev_str);
+ if (ret)
+ goto err;
- if (copy_from_user(devs, &user_arg->devs[0],
- array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
- ret = -EINVAL;
- goto err;
+ ret = darray_push(&devs, dev_str);
+ if (ret) {
+ kfree(dev_str);
+ goto err;
+ }
}
- for (size_t i = 0; i < arg.nr_devs; i++) {
- thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
- ret = PTR_ERR_OR_ZERO(thr->devs[i]);
- if (ret)
- goto err;
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
}
+ thr->opts = bch2_opts_empty();
+
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
@@ -230,15 +229,26 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+ /* We need request_key() to be called before we punt to kthread: */
+ opt_set(thr->opts, nostart, true);
+
+ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+ if (!IS_ERR(thr->c) &&
+ thr->c->opts.errors == BCH_ON_ERROR_panic)
+ thr->c->opts.errors = BCH_ON_ERROR_ro;
+
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
-err:
- if (ret < 0) {
- if (thr)
- bch2_fsck_thread_exit(&thr->thr);
- pr_err("ret %s", bch2_err_str(ret));
- }
- kfree(devs);
+out:
+ darray_for_each(devs, i)
+ kfree(*i);
+ darray_exit(&devs);
return ret;
+err:
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ goto out;
}
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 34731ee0217f62..0022b51ce3c09c 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
+ unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
@@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
- durability_have >= io_opts.data_replicas) {
+ !durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
@@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans,
goto done;
}
- m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+ m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
- m->op.nr_replicas_required = m->op.nr_replicas;
- BUG_ON(!m->op.nr_replicas);
+ /*
+ * If device(s) were set to durability=0 after data was written to them
+ * we can end up with a duribilty=0 extent, and the normal algorithm
+ * that tries not to increase durability doesn't work:
+ */
+ if (!(durability_have + durability_removing))
+ m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+ m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 208ce6f0fc4317..cd99b739941447 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
@@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- do {
+ while (1) {
err = flush_buf(i);
if (err)
return err;
@@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
+ if (done)
+ break;
+
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
- } while (!done);
+ }
if (i->buf.allocation_failure)
return -ENOMEM;
@@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct bch_fs *c = i->c;
+ int err;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ if (!i->iter) {
+ bch2_btree_updates_to_text(&i->buf, c);
+ i->iter++;
+ }
+
+ err = flush_buf(i);
+ if (err)
+ return err;
+
+ if (i->buf.allocation_failure)
+ return -ENOMEM;
+
+ return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_btree_updates_read,
+};
+
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
if (!i)
return -ENOMEM;
@@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+ struct dentry *d;
+
+ d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+ debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+ debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+ debugfs_create_file("bfloat-failed", 0400, d, bd,
+ &bfloat_failed_debug_ops);
+}
+
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
@@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
+ debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+ c->btree_debug, &btree_updates_ops);
+
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
@@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
- debugfs_create_file(bch2_btree_id_str(bd->id),
- 0400, c->btree_debug_dir, bd,
- &btree_debug_ops);
-
- snprintf(name, sizeof(name), "%s-formats",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &btree_format_debug_ops);
-
- snprintf(name, sizeof(name), "%s-bfloat-failed",
- bch2_btree_id_str(bd->id));
-
- debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
- &bfloat_failed_debug_ops);
+ bch2_fs_debug_btree_init(c, bd);
}
}
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
index 4ce5e957a6e916..0f955c3c76a7bc 100644
--- a/fs/bcachefs/eytzinger.c
+++ b/fs/bcachefs/eytzinger.c
@@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n)
struct wrapper {
cmp_func_t cmp;
- swap_func_t swap;
+ swap_func_t swap_func;
};
/*
@@ -125,7 +125,7 @@ struct wrapper {
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
if (swap_func == SWAP_WRAPPER) {
- ((const struct wrapper *)priv)->swap(a, b, (int)size);
+ ((const struct wrapper *)priv)->swap_func(a, b, (int)size);
return;
}
@@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
- if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
+ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
@@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
{
struct wrapper w = {
.cmp = cmp_func,
- .swap = swap_func,
+ .swap_func = swap_func,
};
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index ee0e2df33322d2..24840aee335c0f 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) = eytzinger0_next((_i), (_size)))
/* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
@@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
} while (n < nr);
if (n & 1) {
- /* @i was greater than @search, return previous node: */
+ /*
+ * @i was greater than @search, return previous node:
+ *
+ * if @i was leftmost/smallest element,
+ * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
+ */
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
-static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size,
- cmp_func_t cmp, const void *search)
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+ cmp_func_t cmp, const void *search)
{
ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
- return eytzinger0_next(idx, size);
+
+ /*
+ * if eytitzinger0_find_le() returned -1 - no element was <= search - we
+ * want to return the first element; next/prev identities mean this work
+ * as expected
+ *
+ * similarly if find_le() returns last element, we should return -1;
+ * identities mean this all works out:
+ */
+ return eytzinger0_next(idx, nr);
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ab811c0dad26ac..04a577848b015c 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j)
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
+ mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
+
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 8c053cb64ca5ee..b5161b5d76a008 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -134,6 +134,7 @@ enum journal_flags {
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
JOURNAL_NEED_FLUSH_WRITE,
+ JOURNAL_SPACE_LOW,
};
/* Reasons we may fail to get a journal reservation: */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index b76c16152579c6..0f328aba9760ba 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
}
}
-static bool btree_id_is_alloc(enum btree_id id)
-{
- switch (id) {
- case BTREE_ID_alloc:
- case BTREE_ID_backpointers:
- case BTREE_ID_need_discard:
- case BTREE_ID_freespace:
- case BTREE_ID_bucket_gens:
- return true;
- default:
- return false;
- }
-}
-
/* for -o reconstruct_alloc: */
static void bch2_reconstruct_alloc(struct bch_fs *c)
{
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 0e806f04f3d7c5..544322d5c25170 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+ const struct snapshot_t *s = __snapshot_t(t, id);
+ if (!s)
+ return false;
+
+ return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
bool ret;
@@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
- if (id && id < ancestor) {
- ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
+ ret = id && id < ancestor
+ ? test_ancestor_bitmap(t, id, ancestor)
+ : id == ancestor;
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
- } else {
- ret = id == ancestor;
- }
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
rcu_read_unlock();
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index ed63018f21bef5..8daf80a38d60c6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
+
bch2_fs_journal_stop(&c->journal);
+ bch_info(c, "%sshutdown complete, journal seq %llu",
+ test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+ c->journal.seq_ondisk);
+
/*
* After stopping journal:
*/
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index c86a93a8d8fc81..b18b0cc81b594a 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,7 +17,6 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
-#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
@@ -166,7 +165,6 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
-read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(stripes_heap);
@@ -415,9 +413,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
- if (attr == &sysfs_btree_updates)
- bch2_btree_updates_to_text(out, c);
-
if (attr == &sysfs_btree_cache)
bch2_btree_cache_to_text(out, c);
@@ -639,7 +634,6 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
- &sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_new_stripes,
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc577470f..bfec656f94c075 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b7e7c29278fc05..5cf885b09986ac 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi
#endif
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+ if (v)
+ set_bit(nr, addr);
+ else
+ clear_bit(nr, addr);
+}
+
static inline void __set_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
@@ -795,7 +803,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr)
static inline void __clear_bit_le64(size_t bit, __le64 *addr)
{
- addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64));
+ addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
}
static inline bool test_bit_le64(size_t bit, __le64 *addr)