author		Kent Overstreet <kent.overstreet@linux.dev>	2023-03-14 12:56:38 -0400
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-03-14 12:56:38 -0400
commit		fa358537725c8065b058b558125cf15359936f94 (patch)
tree		c9aa0d375aa87434c4139f59d4aa66281a6c6c17
parent		46ba4fb48ca5ac28f442b74c44ca53196112423f (diff)
download	bcachefs-tools-fa358537725c8065b058b558125cf15359936f94.tar.gz
Update bcachefs sources to 72405e7ff8 bcachefs: Fix bch2_check_extents_to_backpointers()
-rw-r--r--	.bcachefs_revision	2
-rw-r--r--	libbcachefs/alloc_background.c	40
-rw-r--r--	libbcachefs/alloc_background.h	2
-rw-r--r--	libbcachefs/alloc_foreground.c	149
-rw-r--r--	libbcachefs/alloc_foreground.h	8
-rw-r--r--	libbcachefs/backpointers.c	30
-rw-r--r--	libbcachefs/bcachefs.h	23
-rw-r--r--	libbcachefs/btree_iter.c	2
-rw-r--r--	libbcachefs/btree_key_cache.c	23
-rw-r--r--	libbcachefs/btree_key_cache.h	2
-rw-r--r--	libbcachefs/btree_locking.c	34
-rw-r--r--	libbcachefs/btree_locking.h	13
-rw-r--r--	libbcachefs/btree_update.h	3
-rw-r--r--	libbcachefs/btree_update_leaf.c	10
-rw-r--r--	libbcachefs/buckets.c	2
-rw-r--r--	libbcachefs/data_update.c	105
-rw-r--r--	libbcachefs/ec.c	133
-rw-r--r--	libbcachefs/ec.h	40
-rw-r--r--	libbcachefs/extents.c	32
-rw-r--r--	libbcachefs/extents.h	16
-rw-r--r--	libbcachefs/fsck.c	4
-rw-r--r--	libbcachefs/io.c	72
-rw-r--r--	libbcachefs/io.h	61
-rw-r--r--	libbcachefs/io_types.h	2
-rw-r--r--	libbcachefs/journal.c	176
-rw-r--r--	libbcachefs/journal_io.c	3
-rw-r--r--	libbcachefs/journal_reclaim.c	83
-rw-r--r--	libbcachefs/journal_sb.c	27
-rw-r--r--	libbcachefs/journal_sb.h	2
-rw-r--r--	libbcachefs/journal_types.h	10
-rw-r--r--	libbcachefs/migrate.c	5
-rw-r--r--	libbcachefs/move.c	122
-rw-r--r--	libbcachefs/move.h	9
-rw-r--r--	libbcachefs/movinggc.c	20
-rw-r--r--	libbcachefs/opts.h	8
-rw-r--r--	libbcachefs/reflink.c	2
-rw-r--r--	libbcachefs/subvolume.c	6
-rw-r--r--	libbcachefs/super.c	21
-rw-r--r--	libbcachefs/sysfs.c	26
-rw-r--r--	linux/six.c	14
40 files changed, 822 insertions, 520 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 2845be68..d8d13865 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-3856459b1b9f37cebee2bca3c9edcafaf393aa98
+72405e7ff8c5fb569b74b046d19866ee480f29b7
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 5f4bb82c..009a85bc 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -1006,7 +1006,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
iter = bucket->inode;
ca = __bch2_next_dev(c, &iter, NULL);
if (ca)
- bucket->offset = ca->mi.first_bucket;
+ *bucket = POS(ca->dev_idx, ca->mi.first_bucket);
rcu_read_unlock();
return ca != NULL;
@@ -2158,43 +2158,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
*/
bch2_recalc_capacity(c);
- /* Next, close write points that point to this device... */
- for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
- bch2_writepoint_stop(c, ca, &c->write_points[i]);
-
- bch2_writepoint_stop(c, ca, &c->copygc_write_point);
- bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
- bch2_writepoint_stop(c, ca, &c->btree_write_point);
-
- mutex_lock(&c->btree_reserve_cache_lock);
- while (c->btree_reserve_cache_nr) {
- struct btree_alloc *a =
- &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
- bch2_open_buckets_put(c, &a->ob);
- }
- mutex_unlock(&c->btree_reserve_cache_lock);
-
- spin_lock(&c->freelist_lock);
- i = 0;
- while (i < c->open_buckets_partial_nr) {
- struct open_bucket *ob =
- c->open_buckets + c->open_buckets_partial[i];
-
- if (ob->dev == ca->dev_idx) {
- swap(c->open_buckets_partial[i],
- c->open_buckets_partial[--c->open_buckets_partial_nr]);
- ob->on_partial_list = false;
- spin_unlock(&c->freelist_lock);
- bch2_open_bucket_put(c, ob);
- spin_lock(&c->freelist_lock);
- } else {
- i++;
- }
- }
- spin_unlock(&c->freelist_lock);
-
- bch2_ec_stop_dev(c, ca);
+ bch2_open_buckets_stop(c, ca, false);
/*
* Wake up threads that were blocked on allocation, so they can notice
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index c9ff590e..32479839 100644
--- a/libbcachefs/alloc_background.h
+++ b/libbcachefs/alloc_background.h
@@ -216,7 +216,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
u64 free = max_t(s64, 0,
u.d[BCH_DATA_free].buckets
+ u.d[BCH_DATA_need_discard].buckets
- - bch2_dev_buckets_reserved(ca, RESERVE_none));
+ - bch2_dev_buckets_reserved(ca, RESERVE_stripe));
return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
}
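
The hunk above now subtracts the RESERVE_stripe reserve (rather than RESERVE_none) before deciding how many cached buckets to invalidate. A standalone sketch of the clamp arithmetic, not part of the patch, with every bucket count invented for illustration:

/* Toy model of should_invalidate_buckets(); numbers are made up. */
#include <stdio.h>

static long long clamp_ll(long long v, long long lo, long long hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	long long want_free	= 1000;	/* free buckets we want on hand */
	long long free_buckets	= 600;	/* BCH_DATA_free */
	long long need_discard	= 100;	/* BCH_DATA_need_discard */
	long long reserved	= 200;	/* bch2_dev_buckets_reserved(ca, RESERVE_stripe) */
	long long cached	= 5000;	/* BCH_DATA_cached */

	long long free = free_buckets + need_discard - reserved;
	if (free < 0)
		free = 0;

	/* invalidate only as many as we're short, never more than exist: */
	printf("invalidate %lld cached buckets\n",
	       clamp_ll(want_free - free, 0, cached));	/* prints 500 */
	return 0;
}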
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 3a67ac0d..d52f30ac 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -97,7 +97,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
if (ob->ec) {
- ec_stripe_new_put(c, ob->ec);
+ ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
return;
}
@@ -658,9 +658,11 @@ static int add_new_bucket(struct bch_fs *c,
bch_dev_bkey_exists(c, ob->dev)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
__clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += durability;
+ *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+ ? durability : 1;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
@@ -679,6 +681,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
+ unsigned flags,
enum bch_data_type data_type,
enum alloc_reserve reserve,
struct closure *cl)
@@ -729,7 +732,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
if (add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
- have_cache, 0, ob)) {
+ have_cache, flags, ob)) {
ret = 0;
break;
}
@@ -796,7 +799,7 @@ got_bucket:
ob->ec_idx = ec_idx;
ob->ec = h->s;
- ec_stripe_new_get(h->s);
+ ec_stripe_new_get(h->s, STRIPE_REF_io);
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
@@ -823,7 +826,7 @@ static bool want_bucket(struct bch_fs *c,
return false;
if (!ca->mi.durability &&
- (wp->data_type != BCH_DATA_user || !*have_cache))
+ (wp->data_type == BCH_DATA_btree || ec || *have_cache))
return false;
if (ec != (ob->ec != NULL))
@@ -877,6 +880,9 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
spin_lock(&c->freelist_lock);
+ if (!c->open_buckets_partial_nr)
+ goto unlock;
+
for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
@@ -902,7 +908,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
break;
}
}
-
+unlock:
spin_unlock(&c->freelist_lock);
return ret;
}
@@ -967,7 +973,7 @@ retry_blocking:
*/
ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
- wp->data_type, reserve, cl);
+ flags, wp->data_type, reserve, cl);
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
@@ -1017,45 +1023,96 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
return ret < 0 ? ret : 0;
}
-void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
- struct open_buckets *obs)
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+ struct bch_dev *ca, bool ec)
{
- struct open_buckets ptrs = { .nr = 0 };
- struct open_bucket *ob, *ob2;
- unsigned i, j;
-
- open_bucket_for_each(c, obs, ob, i) {
- bool drop = !ca || ob->dev == ca->dev_idx;
+ if (ec) {
+ return ob->ec != NULL;
+ } else if (ca) {
+ bool drop = ob->dev == ca->dev_idx;
+ struct open_bucket *ob2;
+ unsigned i;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
- for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
- if (!ob->ec->blocks[j])
+ for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
+ if (!ob->ec->blocks[i])
continue;
- ob2 = c->open_buckets + ob->ec->blocks[j];
+ ob2 = c->open_buckets + ob->ec->blocks[i];
drop |= ob2->dev == ca->dev_idx;
}
mutex_unlock(&ob->ec->lock);
}
- if (drop)
- bch2_open_bucket_put(c, ob);
- else
- ob_push(c, &ptrs, ob);
+ return drop;
+ } else {
+ return true;
}
-
- *obs = ptrs;
}
-void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
- struct write_point *wp)
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec, struct write_point *wp)
{
+ struct open_buckets ptrs = { .nr = 0 };
+ struct open_bucket *ob;
+ unsigned i;
+
mutex_lock(&wp->lock);
- bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
+ if (should_drop_bucket(ob, c, ca, ec))
+ bch2_open_bucket_put(c, ob);
+ else
+ ob_push(c, &ptrs, ob);
+ wp->ptrs = ptrs;
mutex_unlock(&wp->lock);
}
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+ bool ec)
+{
+ unsigned i;
+
+ /* Next, close write points that point to this device... */
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+ bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+ bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+ bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+ mutex_lock(&c->btree_reserve_cache_lock);
+ while (c->btree_reserve_cache_nr) {
+ struct btree_alloc *a =
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+ bch2_open_buckets_put(c, &a->ob);
+ }
+ mutex_unlock(&c->btree_reserve_cache_lock);
+
+ spin_lock(&c->freelist_lock);
+ i = 0;
+ while (i < c->open_buckets_partial_nr) {
+ struct open_bucket *ob =
+ c->open_buckets + c->open_buckets_partial[i];
+
+ if (should_drop_bucket(ob, c, ca, ec)) {
+ --c->open_buckets_partial_nr;
+ swap(c->open_buckets_partial[i],
+ c->open_buckets_partial[c->open_buckets_partial_nr]);
+ ob->on_partial_list = false;
+ spin_unlock(&c->freelist_lock);
+ bch2_open_bucket_put(c, ob);
+ spin_lock(&c->freelist_lock);
+ } else {
+ i++;
+ }
+ }
+ spin_unlock(&c->freelist_lock);
+
+ bch2_ec_stop_dev(c, ca);
+}
+
static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
unsigned long write_point)
{
@@ -1101,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
return true;
}
-static bool try_decrease_writepoints(struct bch_fs *c,
- unsigned old_nr)
+static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr)
{
struct write_point *wp;
@@ -1123,7 +1179,7 @@ static bool try_decrease_writepoints(struct bch_fs *c,
hlist_del_rcu(&wp->node);
mutex_unlock(&c->write_points_hash_lock);
- bch2_writepoint_stop(c, NULL, wp);
+ bch2_writepoint_stop(c, NULL, false, wp);
return true;
}
@@ -1217,6 +1273,8 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
int ret;
int i;
+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
+
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
ptrs.nr = 0;
@@ -1230,13 +1288,7 @@ retry:
if (wp->data_type != BCH_DATA_user)
have_cache = true;
- if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, reserve,
- flags, cl);
- } else {
+ if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
@@ -1246,11 +1298,28 @@ retry:
bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto alloc_done;
+ /* Don't retry from all devices if we're out of open buckets: */
+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+ goto allocate_blocking;
+
+ /*
+ * Only try to allocate cache (durability = 0 devices) from the
+ * specified target:
+ */
+ have_cache = true;
+
ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
flags, cl);
+ } else {
+allocate_blocking:
+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+ target, erasure_code,
+ nr_replicas, &nr_effective,
+ &have_cache, reserve,
+ flags, cl);
}
alloc_done:
BUG_ON(!ret && nr_effective < nr_replicas);
@@ -1380,14 +1449,16 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
- prt_printf(out, "%zu ref %u %s %u:%llu gen %u",
+ prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
ob - c->open_buckets,
atomic_read(&ob->pin),
data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
- ob->dev, ob->bucket, ob->gen);
+ ob->dev, ob->bucket, ob->gen,
+ ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)
prt_printf(out, " ec idx %llu", ob->ec->idx);
if (ob->on_partial_list)
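
The open_buckets_partial loop in bch2_open_buckets_stop() above uses a pattern worth calling out: removing arbitrary elements from an unordered array while the per-element cleanup must run with the spinlock dropped. A standalone sketch, with a pthread mutex standing in for c->freelist_lock and plain ints standing in for open bucket indices:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int partial[8] = { 1, 4, 4, 7, 4, 9 };
static int partial_nr = 6;

static void put_bucket(int b)	/* stands in for bch2_open_bucket_put() */
{
	printf("put %d\n", b);
}

static void drop_matching(int victim)
{
	int i = 0;

	pthread_mutex_lock(&lock);
	while (i < partial_nr) {
		int b = partial[i];

		if (b == victim) {
			/* swap-remove: the array is unordered */
			partial[i] = partial[--partial_nr];
			/* the put may block, so drop the lock around it */
			pthread_mutex_unlock(&lock);
			put_bucket(b);
			pthread_mutex_lock(&lock);
			/* don't advance i: a new element was swapped in */
		} else {
			i++;
		}
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	drop_matching(4);
	for (int i = 0; i < partial_nr; i++)
		printf("kept %d\n", partial[i]);
	return 0;
}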
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index e9b3b142..8a1cf425 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -151,7 +151,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *,
+ unsigned, unsigned *, bool *, unsigned,
enum bch_data_type, enum alloc_reserve,
struct closure *);
@@ -202,11 +202,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i *, unsigned, bool);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
-void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
- struct open_buckets *);
-
-void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
- struct write_point *);
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index a40c2612..8517c563 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -549,13 +549,18 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
bch2_check_btree_backpointer(&trans, &iter, k)));
}
+struct bpos_level {
+ unsigned level;
+ struct bpos pos;
+};
+
static int check_bp_exists(struct btree_trans *trans,
struct bpos bucket_pos,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
struct bpos bucket_start,
struct bpos bucket_end,
- struct bpos *last_flushed_pos)
+ struct bpos_level *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter alloc_iter, bp_iter = { NULL };
@@ -600,8 +605,11 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- if (!bpos_eq(*last_flushed_pos, orig_k.k->p)) {
- *last_flushed_pos = orig_k.k->p;
+ if (last_flushed->level != bp.level ||
+ !bpos_eq(last_flushed->pos, orig_k.k->p)) {
+ last_flushed->level = bp.level;
+ last_flushed->pos = orig_k.k->p;
+
ret = bch2_btree_write_buffer_flush_sync(trans) ?:
-BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
@@ -639,7 +647,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos bucket_start,
struct bpos bucket_end,
- struct bpos *last_flushed_pos)
+ struct bpos_level *last_flushed)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
@@ -668,7 +676,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
ret = check_bp_exists(trans, bucket_pos, bp, k,
bucket_start, bucket_end,
- last_flushed_pos);
+ last_flushed);
if (ret)
return ret;
}
@@ -680,7 +688,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
enum btree_id btree_id,
struct bpos bucket_start,
struct bpos bucket_end,
- struct bpos *last_flushed_pos)
+ struct bpos_level *last_flushed)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
@@ -709,12 +717,12 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1,
+ bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
k, p, &bucket_pos, &bp);
ret = check_bp_exists(trans, bucket_pos, bp, k,
bucket_start, bucket_end,
- last_flushed_pos);
+ last_flushed);
if (ret)
goto err;
}
@@ -794,7 +802,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
{
struct btree_iter iter;
enum btree_id btree_id;
- struct bpos last_flushed_pos = SPOS_MAX;
+ struct bpos_level last_flushed = { UINT_MAX };
int ret = 0;
for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
@@ -811,7 +819,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_INSERT_NOFAIL,
check_extent_to_backpointers(trans, &iter,
bucket_start, bucket_end,
- &last_flushed_pos));
+ &last_flushed));
if (ret)
break;
} while (!bch2_btree_iter_advance(&iter));
@@ -826,7 +834,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_INSERT_NOFAIL,
check_btree_root_to_backpointers(trans, btree_id,
bucket_start, bucket_end,
- &last_flushed_pos));
+ &last_flushed));
if (ret)
break;
}
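
The new struct bpos_level makes the flush-dedup key (level, pos) instead of pos alone, so backpointer checks at different btree levels for the same position no longer skip a needed flush. A standalone sketch of the dedup logic; the counter stands in for bch2_btree_write_buffer_flush_sync():

#include <stdbool.h>
#include <stdio.h>

struct bpos { unsigned long long inode, offset; };

struct bpos_level {
	unsigned	level;
	struct bpos	pos;
};

static bool bpos_eq(struct bpos a, struct bpos b)
{
	return a.inode == b.inode && a.offset == b.offset;
}

static int flushes;

static void maybe_flush(struct bpos_level *last, unsigned level, struct bpos pos)
{
	if (last->level == level && bpos_eq(last->pos, pos))
		return;		/* already flushed for this key */

	last->level = level;
	last->pos = pos;
	flushes++;		/* the expensive write buffer flush */
}

int main(void)
{
	/* UINT_MAX-style sentinel so the first call always flushes */
	struct bpos_level last = { .level = -1U };

	maybe_flush(&last, 0, (struct bpos){ 1, 100 });
	maybe_flush(&last, 0, (struct bpos){ 1, 100 });	/* deduped */
	maybe_flush(&last, 1, (struct bpos){ 1, 100 });	/* level differs */
	printf("flushes: %d\n", flushes);	/* prints 2 */
	return 0;
}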
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 25a32fd6..348ee8e8 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -214,8 +214,11 @@
#define BCH_WRITE_REF_DEBUG
#endif
+#ifndef dynamic_fault
#define dynamic_fault(...) 0
-#define race_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
#define trace_and_count(_c, _name, ...) \
do { \
@@ -652,7 +655,6 @@ typedef struct {
x(fallocate) \
x(discard) \
x(invalidate) \
- x(move) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
x(sysfs)
@@ -922,6 +924,13 @@ struct bch_fs {
mempool_t large_bkey_pool;
+ /* MOVE.C */
+ struct list_head moving_context_list;
+ struct mutex moving_context_lock;
+
+ struct list_head data_progress_list;
+ struct mutex data_progress_lock;
+
/* REBALANCE */
struct bch_fs_rebalance rebalance;
@@ -932,10 +941,6 @@ struct bch_fs {
bool copygc_running;
wait_queue_head_t copygc_running_wq;
- /* DATA PROGRESS STATS */
- struct list_head data_progress_list;
- struct mutex data_progress_lock;
-
/* STRIPES: */
GENRADIX(struct stripe) stripes;
GENRADIX(struct gc_stripe) gc_stripes;
@@ -952,14 +957,14 @@ struct bch_fs {
struct list_head ec_stripe_new_list;
struct mutex ec_stripe_new_lock;
+ wait_queue_head_t ec_stripe_new_wait;
struct work_struct ec_stripe_create_work;
u64 ec_stripe_hint;
- struct bio_set ec_bioset;
-
struct work_struct ec_stripe_delete_work;
- struct llist_head ec_stripe_delete_list;
+
+ struct bio_set ec_bioset;
/* REFLINK */
u64 reflink_hint;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 2d344993..0a3e5605 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -16,7 +16,7 @@
#include "replicas.h"
#include "subvolume.h"
-#include <linux/prandom.h>
+#include <linux/random.h>
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 298a674d..27a73933 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -770,11 +770,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
unsigned flags,
- struct btree_path *path,
- struct bkey_i *insert)
+ struct btree_insert_entry *insert_entry)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) path->l[0].b;
+ struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_i *insert = insert_entry->k;
bool kick_reclaim = false;
BUG_ON(insert->k.u64s > ck->u64s);
@@ -802,9 +802,24 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
kick_reclaim = true;
}
+ /*
+ * To minimize lock contention, we only add the journal pin here and
+ * defer pin updates to the flush callback via ->seq. Be careful not to
+ * update ->seq on nojournal commits because we don't want to update the
+ * pin to a seq that doesn't include journal updates on disk. Otherwise
+ * we risk losing the update after a crash.
+ *
+ * The only exception is if the pin is not active in the first place. We
+ * have to add the pin because journal reclaim drives key cache
+ * flushing. The flush callback will not proceed unless ->seq matches
+ * the latest pin, so make sure it starts with a consistent value.
+ */
+ if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
+ !journal_pin_active(&ck->journal)) {
+ ck->seq = trans->journal_res.seq;
+ }
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
&ck->journal, bch2_btree_key_cache_journal_flush);
- ck->seq = trans->journal_res.seq;
if (kick_reclaim)
journal_reclaim_kick(&c->journal);
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h
index c86d5e48..be3acde2 100644
--- a/libbcachefs/btree_key_cache.h
+++ b/libbcachefs/btree_key_cache.h
@@ -30,7 +30,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
unsigned);
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
- struct btree_path *, struct bkey_i *);
+ struct btree_insert_entry *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
void bch2_btree_key_cache_drop(struct btree_trans *,
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index 0032d0eb..b9998665 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -388,6 +388,40 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p
return ret;
}
+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ struct btree_path *linked;
+ unsigned i;
+ int ret;
+
+ /*
+ * XXX BIG FAT NOTICE
+ *
+ * Drop all read locks before taking a write lock:
+ *
+ * This is a hack, because bch2_btree_node_lock_write_nofail() is a
+ * hack - but by dropping read locks first, this should never fail, and
+ * we only use this in code paths where whatever read locks we've
+ * already taken are no longer needed:
+ */
+
+ trans_for_each_path(trans, linked) {
+ if (!linked->nodes_locked)
+ continue;
+
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
+ if (btree_node_read_locked(linked, i)) {
+ btree_node_unlock(trans, linked, i);
+ btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
+ }
+ }
+
+ ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
+}
+
/* relock */
static inline bool btree_path_get_locks(struct btree_trans *trans,
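
The hack in bch2_btree_node_lock_write_nofail() works because a write lock can always be acquired once no read locks are held. A standalone sketch of the same upgrade hazard with a POSIX rwlock, where write-locking while still holding your own read lock deadlocks (strictly, is undefined):

#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_rwlock_t l = PTHREAD_RWLOCK_INITIALIZER;

	pthread_rwlock_rdlock(&l);	/* read lock from an earlier path */

	/* pthread_rwlock_wrlock(&l) here would never return */

	pthread_rwlock_unlock(&l);	/* drop read locks first... */
	pthread_rwlock_wrlock(&l);	/* ...then the write lock can't fail */
	printf("write locked\n");
	pthread_rwlock_unlock(&l);
	return 0;
}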
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index bd658e5c..327780ce 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -299,15 +299,6 @@ static inline int __btree_node_lock_write(struct btree_trans *trans,
: __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail);
}
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
-{
- int ret = __btree_node_lock_write(trans, path, b, true);
-
- BUG_ON(ret);
-}
-
static inline int __must_check
bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path,
@@ -316,6 +307,10 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
return __btree_node_lock_write(trans, path, b, false);
}
+void bch2_btree_node_lock_write_nofail(struct btree_trans *,
+ struct btree_path *,
+ struct btree_bkey_cached_common *);
+
/* relock: */
bool bch2_btree_path_relock_norestart(struct btree_trans *,
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index ee1d1593..46fb4a9e 100644
--- a/libbcachefs/btree_update.h
+++ b/libbcachefs/btree_update.h
@@ -13,6 +13,9 @@ void bch2_btree_node_prep_for_write(struct btree_trans *,
bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct btree *, struct btree_node_iter *,
struct bkey_i *);
+
+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64);
+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index c93c132d..629e5288 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -227,12 +227,12 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
return 0;
}
-static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
-static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
@@ -244,8 +244,8 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
bch2_journal_pin_add(&c->journal, seq, &w->journal,
btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
+ ? bch2_btree_node_flush0
+ : bch2_btree_node_flush1);
}
/**
@@ -765,7 +765,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (!i->cached)
btree_insert_key_leaf(trans, i);
else if (!i->key_cache_already_flushed)
- bch2_btree_insert_key_cached(trans, flags, i->path, i->k);
+ bch2_btree_insert_key_cached(trans, flags, i);
else {
bch2_btree_key_cache_drop(trans, i->path);
btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 6805f2c0..1bcef419 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1855,7 +1855,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
if (IS_ERR(a))
return PTR_ERR(a);
- if (a->v.data_type && a->v.data_type != type) {
+ if (a->v.data_type && type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index eb91e24c..e414d1af 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -92,18 +92,6 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans,
return ret;
}
-static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev)
-{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
- struct bch_extent_ptr *ptr;
-
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == dev) {
- bch2_extent_ptr_set_cached(k, ptr);
- return;
- }
-}
-
static int __bch2_data_update_index_update(struct btree_trans *trans,
struct bch_write_op *op)
{
@@ -126,15 +114,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
while (1) {
struct bkey_s_c k;
struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
- struct bkey_i *insert;
+ struct bkey_i *insert = NULL;
struct bkey_i_extent *new;
- const union bch_extent_entry *entry;
+ const union bch_extent_entry *entry_c;
+ union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ struct bch_extent_ptr *ptr;
+ const struct bch_extent_ptr *ptr_c;
struct bpos next_pos;
- bool did_work = false;
bool should_check_enospc;
s64 i_sectors_delta = 0, disk_sectors_delta = 0;
- unsigned i;
+ unsigned rewrites_found = 0, durability, i;
bch2_trans_begin(trans);
@@ -146,7 +136,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
new = bkey_i_to_extent(bch2_keylist_front(keys));
if (!bch2_extents_match(k, old))
- goto nomatch;
+ goto nowork;
bkey_reassemble(_insert.k, k);
insert = _insert.k;
@@ -169,50 +159,60 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
* First, drop rewrite_ptrs from @new:
*/
i = 0;
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
- bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) {
- /*
- * If we're going to be adding a pointer to the
- * same device, we have to drop the old one -
- * otherwise, we can just mark it cached:
- */
- if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev))
- bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev);
- else
- bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev);
+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
+ !ptr->cached) {
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+ rewrites_found |= 1U << i;
}
i++;
}
+ if (m->data_opts.rewrite_ptrs &&
+ !rewrites_found &&
+ bch2_bkey_durability(c, k) >= m->op.opts.data_replicas)
+ goto nowork;
- /* Add new ptrs: */
- extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
- const struct bch_extent_ptr *existing_ptr =
- bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev);
-
- if (existing_ptr && existing_ptr->cached) {
- /*
- * We're replacing a cached pointer with a non
- * cached pointer:
- */
- bch2_bkey_drop_device_noerror(bkey_i_to_s(insert),
- existing_ptr->dev);
- } else if (existing_ptr) {
- /*
- * raced with another move op? extent already
- * has a pointer to the device we just wrote
- * data to
- */
- continue;
+ /*
+ * A replica that we just wrote might conflict with a replica
+ * that we want to keep, due to racing with another move:
+ */
+restart_drop_conflicting_replicas:
+ extent_for_each_ptr(extent_i_to_s(new), ptr)
+ if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
+ !ptr_c->cached) {
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
+ goto restart_drop_conflicting_replicas;
}
- bch2_extent_ptr_decoded_append(insert, &p);
- did_work = true;
+ if (!bkey_val_u64s(&new->k))
+ goto nowork;
+
+ /* Now, drop pointers that conflict with what we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
+
+ durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
+ bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
+
+ /* Now, drop excess replicas: */
+restart_drop_extra_replicas:
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
}
- if (!did_work)
- goto nomatch;
+ /* Finally, add the pointers we just wrote: */
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
+ bch2_extent_ptr_decoded_append(insert, &p);
bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, bkey_i_to_s(insert));
@@ -253,6 +253,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
+ BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
m->data_opts.btree_insert_flags);
if (!ret) {
@@ -273,7 +274,7 @@ next:
goto out;
}
continue;
-nomatch:
+nowork:
if (m->ctxt && m->ctxt->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 7d43fd4a..09c6f93c 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -659,14 +659,13 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
static u64 stripe_idx_to_delete(struct bch_fs *c)
{
ec_stripes_heap *h = &c->ec_stripes_heap;
- size_t heap_idx;
lockdep_assert_held(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->used; heap_idx++)
- if (h->data[heap_idx].blocks_nonempty == 0 &&
- !bch2_stripe_is_open(c, h->data[heap_idx].idx))
- return h->data[heap_idx].idx;
+ if (h->used &&
+ h->data[0].blocks_nonempty == 0 &&
+ !bch2_stripe_is_open(c, h->data[0].idx))
+ return h->data[0].idx;
return 0;
}
@@ -959,7 +958,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bkey_reassemble(n, k);
bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
- ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev);
+ ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
BUG_ON(!ec_ptr);
stripe_ptr = (struct bch_extent_stripe_ptr) {
@@ -990,6 +989,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
+ BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_offset));
@@ -1057,6 +1057,13 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
s->err = ret;
}
+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
+{
+ if (s->idx)
+ bch2_stripe_close(c, s);
+ kfree(s);
+}
+
/*
* data buckets of new stripe all written: create the stripe
*/
@@ -1072,13 +1079,15 @@ static void ec_stripe_create(struct ec_stripe_new *s)
closure_sync(&s->iodone);
- for (i = 0; i < nr_data; i++)
- if (s->blocks[i]) {
- ob = c->open_buckets + s->blocks[i];
+ if (!s->err) {
+ for (i = 0; i < nr_data; i++)
+ if (s->blocks[i]) {
+ ob = c->open_buckets + s->blocks[i];
- if (ob->sectors_free)
- zero_out_rest_of_ec_bucket(c, s, i, ob);
- }
+ if (ob->sectors_free)
+ zero_out_rest_of_ec_bucket(c, s, i, ob);
+ }
+ }
if (s->err) {
if (!bch2_err_matches(s->err, EROFS))
@@ -1119,7 +1128,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
- ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+ ret = bch2_trans_do(c, &s->res, NULL,
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL,
ec_stripe_key_update(&trans, &s->new_stripe.key,
!s->have_existing_stripe));
if (ret) {
@@ -1152,13 +1163,11 @@ err:
list_del(&s->list);
mutex_unlock(&c->ec_stripe_new_lock);
- if (s->idx)
- bch2_stripe_close(c, s);
-
ec_stripe_buf_exit(&s->existing_stripe);
ec_stripe_buf_exit(&s->new_stripe);
closure_debug_destroy(&s->iodone);
- kfree(s);
+
+ ec_stripe_new_put(c, s, STRIPE_REF_stripe);
}
static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
@@ -1167,7 +1176,7 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list)
- if (!atomic_read(&s->pin))
+ if (!atomic_read(&s->ref[STRIPE_REF_io]))
goto out;
s = NULL;
out:
@@ -1209,7 +1218,7 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
list_add(&s->list, &c->ec_stripe_new_list);
mutex_unlock(&c->ec_stripe_new_lock);
- ec_stripe_new_put(c, s);
+ ec_stripe_new_put(c, s, STRIPE_REF_io);
}
void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
@@ -1321,7 +1330,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
mutex_init(&s->lock);
closure_init(&s->iodone, NULL);
- atomic_set(&s->pin, 1);
+ atomic_set(&s->ref[STRIPE_REF_stripe], 1);
+ atomic_set(&s->ref[STRIPE_REF_io], 1);
s->c = c;
s->h = h;
s->nr_data = min_t(unsigned, h->nr_active_devs,
@@ -1402,6 +1412,11 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
+ if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ h = ERR_PTR(-EROFS);
+ goto found;
+ }
+
list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
h->algo == algo &&
@@ -1451,7 +1466,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
&devs,
h->s->nr_parity,
&nr_have_parity,
- &have_cache,
+ &have_cache, 0,
BCH_DATA_parity,
reserve,
cl);
@@ -1478,7 +1493,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
&devs,
h->s->nr_data,
&nr_have_data,
- &have_cache,
+ &have_cache, 0,
BCH_DATA_user,
reserve,
cl);
@@ -1706,6 +1721,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
+ if (reserve == RESERVE_movinggc) {
+ ret = new_stripe_alloc_buckets(trans, h, reserve, NULL) ?:
+ __bch2_ec_stripe_head_reserve(trans, h);
+ if (ret)
+ goto err;
+ goto allocate_buf;
+ }
+
/* XXX freelist_wait? */
closure_wait(&c->freelist_wait, cl);
waiting = true;
@@ -1738,7 +1761,7 @@ err:
return ERR_PTR(ret);
}
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
struct open_bucket *ob;
@@ -1746,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-
mutex_lock(&h->lock);
if (!h->s)
goto unlock;
+ if (!ca)
+ goto found;
+
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (!h->s->blocks[i])
continue;
@@ -1769,6 +1794,32 @@ unlock:
mutex_unlock(&c->ec_stripe_head_lock);
}
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+ __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+ __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+ bool ret;
+
+ mutex_lock(&c->ec_stripe_new_lock);
+ ret = list_empty(&c->ec_stripe_new_list);
+ mutex_unlock(&c->ec_stripe_new_lock);
+
+ return ret;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+ wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
int bch2_stripes_read(struct bch_fs *c)
{
struct btree_trans trans;
@@ -1821,13 +1872,16 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
size_t i;
mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->used, 20); i++) {
+ for (i = 0; i < min_t(size_t, h->used, 50); i++) {
m = genradix_ptr(&c->stripes, h->data[i].idx);
- prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+ prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
h->data[i].blocks_nonempty,
m->nr_blocks - m->nr_redundant,
m->nr_redundant);
+ if (bch2_stripe_is_open(c, h->data[i].idx))
+ prt_str(out, " open");
+ prt_newline(out);
}
mutex_unlock(&c->ec_stripes_heap_lock);
}
@@ -1839,22 +1893,27 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
- prt_printf(out, "target %u algo %u redundancy %u:\n",
- h->target, h->algo, h->redundancy);
+ prt_printf(out, "target %u algo %u redundancy %u %s:\n",
+ h->target, h->algo, h->redundancy,
+ bch2_alloc_reserves[h->reserve]);
if (h->s)
- prt_printf(out, "\tpending: idx %llu blocks %u+%u allocated %u\n",
+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
h->s->idx, h->s->nr_data, h->s->nr_parity,
bitmap_weight(h->s->blocks_allocated,
h->s->nr_data));
}
mutex_unlock(&c->ec_stripe_head_lock);
+ prt_printf(out, "in flight:\n");
+
mutex_lock(&c->ec_stripe_new_lock);
list_for_each_entry(s, &c->ec_stripe_new_list, list) {
- prt_printf(out, "\tin flight: idx %llu blocks %u+%u pin %u\n",
+ prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
s->idx, s->nr_data, s->nr_parity,
- atomic_read(&s->pin));
+ atomic_read(&s->ref[STRIPE_REF_io]),
+ atomic_read(&s->ref[STRIPE_REF_stripe]),
+ bch2_alloc_reserves[s->h->reserve]);
}
mutex_unlock(&c->ec_stripe_new_lock);
}
@@ -1892,14 +1951,22 @@ void bch2_fs_ec_exit(struct bch_fs *c)
void bch2_fs_ec_init_early(struct bch_fs *c)
{
+ spin_lock_init(&c->ec_stripes_new_lock);
+ mutex_init(&c->ec_stripes_heap_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
+ mutex_init(&c->ec_stripe_head_lock);
+
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
+ mutex_init(&c->ec_stripe_new_lock);
+ init_waitqueue_head(&c->ec_stripe_new_wait);
+
INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}
int bch2_fs_ec_init(struct bch_fs *c)
{
- spin_lock_init(&c->ec_stripes_new_lock);
-
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
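
stripe_idx_to_delete() now inspects only h->data[0]. This is sound assuming the stripes heap keeps the entry with the fewest nonempty blocks at the root: if the root isn't empty, nothing below it can be. A standalone sketch of that invariant-based shortcut:

#include <stdio.h>

struct heap_ent { unsigned blocks_nonempty; unsigned long long idx; };

static unsigned long long idx_to_delete(struct heap_ent *h, unsigned used)
{
	/* heap invariant: h[0] has the minimum blocks_nonempty */
	if (used && h[0].blocks_nonempty == 0)
		return h[0].idx;
	return 0;
}

int main(void)
{
	struct heap_ent h[] = { { 0, 7 }, { 2, 3 }, { 5, 4 } };

	printf("delete stripe %llu\n", idx_to_delete(h, 3));	/* 7 */
	return 0;
}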
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index d112aea9..7c08a49d 100644
--- a/libbcachefs/ec.h
+++ b/libbcachefs/ec.h
@@ -143,6 +143,12 @@ struct ec_stripe_buf {
struct ec_stripe_head;
+enum ec_stripe_ref {
+ STRIPE_REF_io,
+ STRIPE_REF_stripe,
+ STRIPE_REF_NR
+};
+
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
@@ -154,8 +160,7 @@ struct ec_stripe_new {
struct closure iodone;
- /* counts in flight writes, stripe is created when pin == 0 */
- atomic_t pin;
+ atomic_t ref[STRIPE_REF_NR];
int err;
@@ -213,24 +218,35 @@ void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *);
-static inline void ec_stripe_new_get(struct ec_stripe_new *s)
+static inline void ec_stripe_new_get(struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
{
- atomic_inc(&s->pin);
+ atomic_inc(&s->ref[ref]);
}
-static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+ enum ec_stripe_ref ref)
{
- BUG_ON(atomic_read(&s->pin) <= 0);
- BUG_ON(!s->err && !s->idx);
-
- if (atomic_dec_and_test(&s->pin))
- bch2_ec_do_stripe_creates(c);
+ BUG_ON(atomic_read(&s->ref[ref]) <= 0);
+
+ if (atomic_dec_and_test(&s->ref[ref]))
+ switch (ref) {
+ case STRIPE_REF_stripe:
+ bch2_ec_stripe_new_free(c, s);
+ break;
+ case STRIPE_REF_io:
+ bch2_ec_do_stripe_creates(c);
+ break;
+ default:
+ unreachable();
+ }
}
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-
-void bch2_ec_flush_new_stripes(struct bch_fs *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
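
A standalone sketch of the split refcount that replaces ->pin: dropping the last io ref kicks stripe creation, dropping the last stripe ref frees the object. The direct io-to-stripe handoff below is a simplification; in the patch, the creation work drops STRIPE_REF_stripe when it finishes:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

enum stripe_ref { REF_io, REF_stripe, REF_NR };

struct stripe_new {
	atomic_int ref[REF_NR];
};

static void stripe_put(struct stripe_new *s, enum stripe_ref r)
{
	/* mirrors atomic_dec_and_test(): true on the transition to zero */
	if (atomic_fetch_sub(&s->ref[r], 1) == 1) {
		if (r == REF_io) {
			printf("io refs gone: kick stripe creation\n");
			stripe_put(s, REF_stripe);	/* simplified handoff */
		} else {
			printf("stripe refs gone: free\n");
			free(s);
		}
	}
}

int main(void)
{
	struct stripe_new *s = calloc(1, sizeof(*s));

	atomic_init(&s->ref[REF_stripe], 1);
	atomic_init(&s->ref[REF_io], 1);

	stripe_put(s, REF_io);	/* last in-flight write completes */
	return 0;
}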
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 4fc581be..e2c09ea4 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -26,8 +26,6 @@
#include <trace/events/bcachefs.h>
-static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *);
-
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -512,7 +510,7 @@ restart_narrow_pointers:
bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
bch2_extent_ptr_decoded_append(k, &p);
@@ -765,8 +763,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
/*
* Returns pointer to the next entry after the one being dropped:
*/
-static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
- struct bch_extent_ptr *ptr)
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -809,7 +807,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
{
bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
union bch_extent_entry *ret =
- __bch2_bkey_drop_ptr(k, ptr);
+ bch2_bkey_drop_ptr_noerror(k, ptr);
/*
* If we deleted all the dirty pointers and there's still cached
@@ -840,14 +838,13 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
- struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev);
+ struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);
if (ptr)
- __bch2_bkey_drop_ptr(k, ptr);
+ bch2_bkey_drop_ptr_noerror(k, ptr);
}
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const struct bch_extent_ptr *ptr;
@@ -922,11 +919,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
}
}
-bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
- struct bkey_s_c k2)
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
{
- struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
- const union bch_extent_entry *entry2;
+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+ union bch_extent_entry *entry2;
struct extent_ptr_decoded p2;
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
@@ -934,9 +931,9 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1,
p1.ptr.gen == p2.ptr.gen &&
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
- return true;
+ return &entry2->ptr;
- return false;
+ return NULL;
}
void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
@@ -992,6 +989,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca;
bool first = true;
+ if (c)
+ prt_printf(out, "durability: %u ", bch2_bkey_durability(c, k));
+
bkey_extent_entry_for_each(ptrs, entry) {
if (!first)
prt_printf(out, " ");
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index bac6a1ed..9b026ae9 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -613,14 +613,21 @@ unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
void bch2_bkey_drop_device(struct bkey_s, unsigned);
void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned);
-const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned);
+
+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev)
+{
+ return (void *) bch2_bkey_has_device_c(k.s_c, dev);
+}
+
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
{
- EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
switch (k->k.type) {
case KEY_TYPE_btree_ptr:
@@ -642,6 +649,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s,
+ struct bch_extent_ptr *);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
@@ -665,7 +674,8 @@ do { \
bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_extent_ptr, u64);
bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
-bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c);
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index e232f331..5e6dc6c3 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -954,11 +954,11 @@ static int check_inode(struct btree_trans *trans,
iter->pos.snapshot),
POS(u.bi_inum, U64_MAX),
0, NULL);
- if (ret) {
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
bch_err(c, "error in fsck: error truncating inode: %s",
bch2_err_str(ret));
+ if (ret)
return ret;
- }
/*
* We truncated without our normal sector accounting hook, just
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index ea0fd631..76856bfd 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -218,7 +218,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
bch2_trans_copy_iter(&iter, extent_iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+ for_each_btree_key_upto_continue_norestart(iter,
+ new->k.p, BTREE_ITER_SLOTS, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
bkey_start_offset(old.k));
@@ -705,7 +706,8 @@ static void bch2_write_done(struct closure *cl)
struct bch_fs *c = op->c;
bch2_disk_reservation_put(c, &op->res);
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ if (!(op->flags & BCH_WRITE_MOVE))
+ bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
@@ -834,36 +836,30 @@ static void bch2_write_index(struct closure *cl)
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
+ unsigned long flags;
if ((op->flags & BCH_WRITE_DONE) &&
(op->flags & BCH_WRITE_MOVE))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
- barrier();
-
- /*
- * We're not using wp->writes_lock here, so this is racey: that's ok,
- * because this is just for diagnostic purposes, and we're running out
- * of interrupt context here so if we were to take the lock we'd have to
- * switch to spin_lock_irq()/irqsave(), which is not free:
- */
+ spin_lock_irqsave(&wp->writes_lock, flags);
if (wp->state == WRITE_POINT_waiting_io)
__wp_update_state(wp, WRITE_POINT_waiting_work);
+ list_add_tail(&op->wp_list, &wp->writes);
+ spin_unlock_irqrestore(&wp->writes_lock, flags);
- op->btree_update_ready = true;
queue_work(wq, &wp->index_update_work);
}
static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{
- op->btree_update_ready = false;
op->wp = wp;
- spin_lock(&wp->writes_lock);
- list_add_tail(&op->wp_list, &wp->writes);
- if (wp->state == WRITE_POINT_stopped)
+ if (wp->state == WRITE_POINT_stopped) {
+ spin_lock_irq(&wp->writes_lock);
__wp_update_state(wp, WRITE_POINT_waiting_io);
- spin_unlock(&wp->writes_lock);
+ spin_unlock_irq(&wp->writes_lock);
+ }
}
void bch2_write_point_do_index_updates(struct work_struct *work)
@@ -873,16 +869,12 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
struct bch_write_op *op;
while (1) {
- spin_lock(&wp->writes_lock);
- list_for_each_entry(op, &wp->writes, wp_list)
- if (op->btree_update_ready) {
- list_del(&op->wp_list);
- goto unlock;
- }
- op = NULL;
-unlock:
+ spin_lock_irq(&wp->writes_lock);
+ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+ if (op)
+ list_del(&op->wp_list);
wp_update_state(wp, op != NULL);
- spin_unlock(&wp->writes_lock);
+ spin_unlock_irq(&wp->writes_lock);
if (!op)
break;
@@ -1673,7 +1665,6 @@ static void __bch2_write(struct bch_write_op *op)
}
again:
memset(&op->failed, 0, sizeof(op->failed));
- op->btree_update_ready = false;
do {
struct bkey_i *key_to_write;
@@ -1853,7 +1844,12 @@ void bch2_write(struct closure *cl)
goto err;
}
- if (c->opts.nochanges ||
+ if (c->opts.nochanges) {
+ op->error = -BCH_ERR_erofs_no_writes;
+ goto err;
+ }
+
+ if (!(op->flags & BCH_WRITE_MOVE) &&
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;
@@ -1881,6 +1877,28 @@ err:
op->end_io(op);
}
+const char * const bch2_write_flags[] = {
+#define x(f) #f,
+ BCH_WRITE_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+ prt_str(out, "pos: ");
+ bch2_bpos_to_text(out, op->pos);
+ prt_newline(out);
+
+ prt_str(out, "started: ");
+ bch2_pr_time_units(out, local_clock() - op->start_time);
+ prt_newline(out);
+
+ prt_str(out, "flags: ");
+ prt_bitflags(out, bch2_write_flags, op->flags);
+ prt_newline(out);
+}
+
/* Cache promotion on read */
struct promote_op {
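
The write-completion path now queues ops on wp->writes under an irq-safe lock and the worker pops them FIFO. A standalone userspace sketch of that producer/consumer shape; a pthread mutex and a singly linked tail queue stand in for the spinlock and list_head, and the kernel side needs spin_lock_irqsave() because completions can run in interrupt context:

#include <pthread.h>
#include <stdio.h>

struct op { struct op *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct op *head, **tail = &head;

static void complete_op(struct op *op)	/* bch2_write_index() side */
{
	pthread_mutex_lock(&lock);
	*tail = op;		/* list_add_tail(&op->wp_list, &wp->writes) */
	tail = &op->next;
	pthread_mutex_unlock(&lock);
	/* queue_work(wq, &wp->index_update_work) would go here */
}

static void index_updates(void)	/* bch2_write_point_do_index_updates() side */
{
	for (;;) {
		struct op *op;

		pthread_mutex_lock(&lock);
		op = head;	/* list_first_entry_or_null() */
		if (op && !(head = op->next))
			tail = &head;
		pthread_mutex_unlock(&lock);

		if (!op)
			break;
		printf("index update for op %d\n", op->id);
	}
}

int main(void)
{
	struct op a = { .id = 1 }, b = { .id = 2 };

	complete_op(&a);
	complete_op(&b);
	index_updates();	/* prints op 1, then op 2 */
	return 0;
}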
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 166ad681..90948bb0 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -28,41 +28,34 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
const char *bch2_blk_status_to_str(blk_status_t);
-enum bch_write_flags {
- __BCH_WRITE_ALLOC_NOWAIT,
- __BCH_WRITE_CACHED,
- __BCH_WRITE_DATA_ENCODED,
- __BCH_WRITE_PAGES_STABLE,
- __BCH_WRITE_PAGES_OWNED,
- __BCH_WRITE_ONLY_SPECIFIED_DEVS,
- __BCH_WRITE_WROTE_DATA_INLINE,
- __BCH_WRITE_FROM_INTERNAL,
- __BCH_WRITE_CHECK_ENOSPC,
- __BCH_WRITE_SYNC,
- __BCH_WRITE_MOVE,
- __BCH_WRITE_IN_WORKER,
- __BCH_WRITE_DONE,
- __BCH_WRITE_IO_ERROR,
- __BCH_WRITE_CONVERT_UNWRITTEN,
+#define BCH_WRITE_FLAGS() \
+ x(ALLOC_NOWAIT) \
+ x(CACHED) \
+ x(DATA_ENCODED) \
+ x(PAGES_STABLE) \
+ x(PAGES_OWNED) \
+ x(ONLY_SPECIFIED_DEVS) \
+ x(WROTE_DATA_INLINE) \
+ x(FROM_INTERNAL) \
+ x(CHECK_ENOSPC) \
+ x(SYNC) \
+ x(MOVE) \
+ x(IN_WORKER) \
+ x(DONE) \
+ x(IO_ERROR) \
+ x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
};
-#define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT)
-#define BCH_WRITE_CACHED (1U << __BCH_WRITE_CACHED)
-#define BCH_WRITE_DATA_ENCODED (1U << __BCH_WRITE_DATA_ENCODED)
-#define BCH_WRITE_PAGES_STABLE (1U << __BCH_WRITE_PAGES_STABLE)
-#define BCH_WRITE_PAGES_OWNED (1U << __BCH_WRITE_PAGES_OWNED)
-#define BCH_WRITE_ONLY_SPECIFIED_DEVS (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS)
-#define BCH_WRITE_WROTE_DATA_INLINE (1U << __BCH_WRITE_WROTE_DATA_INLINE)
-#define BCH_WRITE_FROM_INTERNAL (1U << __BCH_WRITE_FROM_INTERNAL)
-#define BCH_WRITE_CHECK_ENOSPC (1U << __BCH_WRITE_CHECK_ENOSPC)
-#define BCH_WRITE_SYNC (1U << __BCH_WRITE_SYNC)
-#define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE)
-
-/* Internal: */
-#define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER)
-#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE)
-#define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR)
-#define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN)
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
@@ -124,6 +117,8 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio;
}
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
struct bch_devs_mask;
struct cache_promote_op;
struct extent_ptr_decoded;
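
The BCH_WRITE_FLAGS() rewrite is the classic x-macro trick: one list expands once into bit indices, once into masks, and once into the name table consumed by prt_bitflags(). A standalone sketch with a trimmed flag list:

#include <stdio.h>

#define FLAGS()		\
	x(CACHED)	\
	x(SYNC)		\
	x(MOVE)

enum __flags {
#define x(f) __FLAG_##f,
	FLAGS()
#undef x
};

enum flags {
#define x(f) FLAG_##f = 1U << __FLAG_##f,
	FLAGS()
#undef x
};

static const char * const flag_names[] = {
#define x(f) #f,
	FLAGS()
#undef x
	NULL
};

int main(void)
{
	unsigned v = FLAG_SYNC | FLAG_MOVE;

	for (unsigned i = 0; flag_names[i]; i++)
		if (v & (1U << i))
			printf("%s\n", flag_names[i]);	/* SYNC, MOVE */
	return 0;
}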
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index 4e5d3106..3b2ed0fa 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -119,7 +119,7 @@ struct bch_write_op {
unsigned nr_replicas_required:4;
unsigned alloc_reserve:3;
unsigned incompressible:1;
- unsigned btree_update_ready:1;
+ unsigned stripe_waited:1;
struct bch_devs_list devs_have;
u16 target;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index e0c4f51a..5699a9d8 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -68,8 +68,9 @@ journal_seq_to_buf(struct journal *j, u64 seq)
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->key_cache_list);
+ unsigned i;
+ for (i = 0; i < ARRAY_SIZE(p->list); i++)
+ INIT_LIST_HEAD(&p->list[i]);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
p->devs.nr = 0;
@@ -758,19 +759,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
struct open_bucket **ob = NULL;
long *bu = NULL;
- unsigned i, nr_got = 0, nr_want = nr - ja->nr;
- unsigned old_nr = ja->nr;
- unsigned old_discard_idx = ja->discard_idx;
- unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk;
- unsigned old_dirty_idx = ja->dirty_idx;
- unsigned old_cur_idx = ja->cur_idx;
+ unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
int ret = 0;
- if (c) {
- bch2_journal_flush_all_pins(&c->journal);
- bch2_journal_block(&c->journal);
- mutex_lock(&c->sb_lock);
- }
+ BUG_ON(nr <= ja->nr);
bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
@@ -778,7 +770,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
if (!bu || !ob || !new_buckets || !new_bucket_seq) {
ret = -ENOMEM;
- goto err_unblock;
+ goto err_free;
}
for (nr_got = 0; nr_got < nr_want; nr_got++) {
@@ -794,87 +786,92 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
if (ret)
break;
+ ret = bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(&trans, ca,
+ ob[nr_got]->bucket, BCH_DATA_journal,
+ ca->mi.bucket_size));
+ if (ret) {
+ bch2_open_bucket_put(c, ob[nr_got]);
+ bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+ break;
+ }
+
bu[nr_got] = ob[nr_got]->bucket;
}
}
if (!nr_got)
- goto err_unblock;
+ goto err_free;
- /*
- * We may be called from the device add path, before the new device has
- * actually been added to the running filesystem:
- */
- if (!new_fs)
- spin_lock(&c->journal.lock);
+ /* Don't return an error if we successfully allocated some buckets: */
+ ret = 0;
+
+ if (c) {
+ bch2_journal_flush_all_pins(&c->journal);
+ bch2_journal_block(&c->journal);
+ mutex_lock(&c->sb_lock);
+ }
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
+
+ BUG_ON(ja->discard_idx > ja->nr);
+
+ pos = ja->discard_idx ?: ja->nr;
+
+ memmove(new_buckets + pos + nr_got,
+ new_buckets + pos,
+ sizeof(new_buckets[0]) * (ja->nr - pos));
+ memmove(new_bucket_seq + pos + nr_got,
+ new_bucket_seq + pos,
+ sizeof(new_bucket_seq[0]) * (ja->nr - pos));
for (i = 0; i < nr_got; i++) {
- unsigned pos = ja->discard_idx ?: ja->nr;
- long b = bu[i];
-
- __array_insert_item(ja->buckets, ja->nr, pos);
- __array_insert_item(ja->bucket_seq, ja->nr, pos);
- ja->nr++;
-
- ja->buckets[pos] = b;
- ja->bucket_seq[pos] = 0;
-
- if (pos <= ja->discard_idx)
- ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
- if (pos <= ja->dirty_idx_ondisk)
- ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
- if (pos <= ja->dirty_idx)
- ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
- if (pos <= ja->cur_idx)
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+ new_buckets[pos + i] = bu[i];
+ new_bucket_seq[pos + i] = 0;
}
- ret = bch2_journal_buckets_to_sb(c, ca);
- if (ret) {
- /* Revert: */
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- ja->nr = old_nr;
- ja->discard_idx = old_discard_idx;
- ja->dirty_idx_ondisk = old_dirty_idx_ondisk;
- ja->dirty_idx = old_dirty_idx;
- ja->cur_idx = old_cur_idx;
- }
+ nr = ja->nr + nr_got;
- if (!new_fs)
- spin_unlock(&c->journal.lock);
+ ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+ if (ret)
+ goto err_unblock;
- if (ja->nr != old_nr && !new_fs)
+ if (!new_fs)
bch2_write_super(c);
+ /* Commit: */
if (c)
- bch2_journal_unblock(&c->journal);
+ spin_lock(&c->journal.lock);
- if (ret)
- goto err;
+ swap(new_buckets, ja->buckets);
+ swap(new_bucket_seq, ja->bucket_seq);
+ ja->nr = nr;
+
+ if (pos <= ja->discard_idx)
+ ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx_ondisk)
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+ if (pos <= ja->dirty_idx)
+ ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+ if (pos <= ja->cur_idx)
+ ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
- if (!new_fs) {
- for (i = 0; i < nr_got; i++) {
- ret = bch2_trans_run(c,
- bch2_trans_mark_metadata_bucket(&trans, ca,
- bu[i], BCH_DATA_journal,
- ca->mi.bucket_size));
- if (ret) {
- bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
- goto err;
- }
- }
- }
-err:
if (c)
+ spin_unlock(&c->journal.lock);
+err_unblock:
+ if (c) {
+ bch2_journal_unblock(&c->journal);
mutex_unlock(&c->sb_lock);
+ }
- if (ob && !new_fs)
+ if (ret && !new_fs)
+ for (i = 0; i < nr_got; i++)
+ bch2_trans_run(c,
+ bch2_trans_mark_metadata_bucket(&trans, ca,
+ bu[i], BCH_DATA_free, 0));
+err_free:
+ if (!new_fs)
for (i = 0; i < nr_got; i++)
bch2_open_bucket_put(c, ob[i]);
@@ -882,12 +879,7 @@ err:
kfree(new_buckets);
kfree(ob);
kfree(bu);
-
return ret;
-err_unblock:
- if (c)
- bch2_journal_unblock(&c->journal);
- goto err;
}
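/*
 * Rough shape of the restructured error handling above: buckets are allocated
 * and marked before the journal is flushed and blocked, and the exit labels
 * unwind in reverse order -
 *
 *	err_unblock:	unblock the journal and drop sb_lock
 *			(on error, the new buckets are re-marked BCH_DATA_free)
 *	err_free:	put the open buckets and free the scratch arrays
 */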
/*
@@ -901,13 +893,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
struct closure cl;
int ret = 0;
+ closure_init_stack(&cl);
+
+ down_write(&c->state_lock);
+
/* don't handle reducing nr of buckets yet: */
if (nr < ja->nr)
- return 0;
-
- closure_init_stack(&cl);
+ goto unlock;
- while (ja->nr != nr) {
+ while (ja->nr < nr) {
struct disk_reservation disk_res = { 0, 0 };
/*
@@ -938,7 +932,8 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
if (ret)
bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
-
+unlock:
+ up_write(&c->state_lock);
return ret;
}
@@ -977,7 +972,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, seq);
- if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx))
+ if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx))
ret = true;
}
spin_unlock(&j->lock);
@@ -1353,6 +1348,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
+ unsigned i;
spin_lock(&j->lock);
*seq = max(*seq, j->pin.front);
@@ -1370,15 +1366,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_newline(out);
printbuf_indent_add(out, 2);
- list_for_each_entry(pin, &pin_list->list, list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
-
- list_for_each_entry(pin, &pin_list->key_cache_list, list) {
- prt_printf(out, "\t%px %ps", pin, pin->flush);
- prt_newline(out);
- }
+ for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+ list_for_each_entry(pin, &pin_list->list[i], list) {
+ prt_printf(out, "\t%px %ps", pin, pin->flush);
+ prt_newline(out);
+ }
if (!list_empty(&pin_list->flushed)) {
prt_printf(out, "flushed:");
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index 8d3878bd..cfd92d8b 100644
--- a/libbcachefs/journal_io.c
+++ b/libbcachefs/journal_io.c
@@ -1339,8 +1339,7 @@ static void __journal_write_alloc(struct journal *j,
if (!ca->mi.durability ||
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
- bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
- ca->dev_idx) ||
+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
sectors > ja->sectors_free)
continue;
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 8744581d..8c88884c 100644
--- a/libbcachefs/journal_reclaim.c
+++ b/libbcachefs/journal_reclaim.c
@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
+#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
@@ -318,9 +319,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
*/
while (!fifo_empty(&j->pin) &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
- BUG_ON(!fifo_pop(&j->pin, temp));
+ fifo_pop(&j->pin, temp);
popped = true;
}
@@ -379,6 +378,17 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
+enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+{
+ if (fn == bch2_btree_node_flush0 ||
+ fn == bch2_btree_node_flush1)
+ return JOURNAL_PIN_btree;
+ else if (fn == bch2_btree_key_cache_journal_flush)
+ return JOURNAL_PIN_key_cache;
+ else
+ return JOURNAL_PIN_other;
+}
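+/*
+ * Callers now select pin classes with a bitmask over journal_pin_type: e.g.
+ * 1U << JOURNAL_PIN_key_cache flushes only key cache pins, while ~0 flushes
+ * every class. journal_pin_set() below uses the same classifier to pick which
+ * per-type list a new pin lands on.
+ */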
+
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
@@ -407,10 +417,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
pin->seq = seq;
pin->flush = flush_fn;
- if (flush_fn == bch2_btree_key_cache_journal_flush)
- list_add(&pin->list, &pin_list->key_cache_list);
- else if (flush_fn)
- list_add(&pin->list, &pin_list->list);
+ if (flush_fn)
+ list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
else
list_add(&pin->list, &pin_list->flushed);
@@ -446,37 +454,37 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j,
- bool get_any,
- bool get_key_cache,
- u64 max_seq, u64 *seq)
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
+ u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
+ unsigned i;
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
- if (*seq > max_seq && !get_any && !get_key_cache)
+ if (*seq > seq_to_flush && !allowed_above_seq)
break;
- if (*seq <= max_seq || get_any) {
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
-
- if (*seq <= max_seq || get_any || get_key_cache) {
- ret = list_first_entry_or_null(&pin_list->key_cache_list,
- struct journal_entry_pin, list);
- if (ret)
- return ret;
- }
+ for (i = 0; i < JOURNAL_PIN_NR; i++)
+ if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+ ((1U << i) & allowed_above_seq)) {
+ ret = list_first_entry_or_null(&pin_list->list[i],
+ struct journal_entry_pin, list);
+ if (ret)
+ return ret;
+ }
}
return NULL;
}
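/*
 * allowed_below_seq selects which pin classes may be flushed while *seq is at
 * or below seq_to_flush; allowed_above_seq additionally permits classes past
 * that point. For example, allowed_below_seq = ~0 with allowed_above_seq = 0
 * reproduces the old "flush anything up to seq_to_flush" behaviour.
 */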
/* returns true if we did work */
-static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+static size_t journal_flush_pins(struct journal *j,
+ u64 seq_to_flush,
+ unsigned allowed_below_seq,
+ unsigned allowed_above_seq,
unsigned min_any,
unsigned min_key_cache)
{
@@ -489,15 +497,25 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
lockdep_assert_held(&j->reclaim_lock);
while (1) {
+ unsigned allowed_above = allowed_above_seq;
+ unsigned allowed_below = allowed_below_seq;
+
+ if (min_any) {
+ allowed_above |= ~0;
+ allowed_below |= ~0;
+ }
+
+ if (min_key_cache) {
+ allowed_above |= 1U << JOURNAL_PIN_key_cache;
+ allowed_below |= 1U << JOURNAL_PIN_key_cache;
+ }
+
cond_resched();
j->last_flushed = jiffies;
spin_lock(&j->lock);
- pin = journal_get_next_pin(j,
- min_any != 0,
- min_key_cache != 0,
- seq_to_flush, &seq);
+ pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
if (pin) {
BUG_ON(j->flush_in_progress);
j->flush_in_progress = pin;
@@ -656,6 +674,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
atomic_long_read(&c->btree_key_cache.nr_keys));
nr_flushed = journal_flush_pins(j, seq_to_flush,
+ ~0, 0,
min_nr, min_key_cache);
if (direct)
@@ -776,7 +795,11 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- if (journal_flush_pins(j, seq_to_flush, 0, 0))
+ if (journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_key_cache)|
+ (1U << JOURNAL_PIN_other), 0, 0, 0) ||
+ journal_flush_pins(j, seq_to_flush,
+ (1U << JOURNAL_PIN_btree), 0, 0, 0))
*did_work = true;
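/*
 * Flushing now happens in two passes - key cache and other pins first, btree
 * node pins second - presumably so that btree updates generated by the first
 * pass are written out together with the already-dirty btree nodes instead of
 * redirtying nodes that were just flushed.
 */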
spin_lock(&j->lock);
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
index 9b933330..5be78823 100644
--- a/libbcachefs/journal_sb.c
+++ b/libbcachefs/journal_sb.c
@@ -175,46 +175,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
.to_text = bch2_sb_journal_v2_to_text,
};
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+ u64 *buckets, unsigned nr)
{
- struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal_v2 *j;
- unsigned i, dst = 0, nr = 1;
+ unsigned i, dst = 0, nr_compacted = 1;
if (c)
lockdep_assert_held(&c->sb_lock);
- if (!ja->nr) {
+ if (!nr) {
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
return 0;
}
- for (i = 0; i + 1 < ja->nr; i++)
- if (ja->buckets[i] + 1 != ja->buckets[i + 1])
- nr++;
+ for (i = 0; i + 1 < nr; i++)
+ if (buckets[i] + 1 != buckets[i + 1])
+ nr_compacted++;
j = bch2_sb_resize_journal_v2(&ca->disk_sb,
- (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+ (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
if (!j)
return -BCH_ERR_ENOSPC_sb_journal;
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
- j->d[dst].start = le64_to_cpu(ja->buckets[0]);
+	j->d[dst].start = cpu_to_le64(buckets[0]);
	j->d[dst].nr = cpu_to_le64(1);
- for (i = 1; i < ja->nr; i++) {
- if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+ for (i = 1; i < nr; i++) {
+ if (buckets[i] == buckets[i - 1] + 1) {
le64_add_cpu(&j->d[dst].nr, 1);
} else {
dst++;
- j->d[dst].start = le64_to_cpu(ja->buckets[i]);
+			j->d[dst].start = cpu_to_le64(buckets[i]);
			j->d[dst].nr = cpu_to_le64(1);
}
}
- BUG_ON(dst + 1 != nr);
-
+ BUG_ON(dst + 1 != nr_compacted);
return 0;
}
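/*
 * Worked example of the compaction above: with buckets = { 10, 11, 12, 20, 21 }
 * and nr = 5, only the 12 -> 20 pair is discontiguous, so nr_compacted = 2 and
 * the superblock field ends up as
 *
 *	d[0] = { .start = 10, .nr = 3 }
 *	d[1] = { .start = 20, .nr = 2 }
 */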
diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h
index a39192e9..ba40a7e8 100644
--- a/libbcachefs/journal_sb.h
+++ b/libbcachefs/journal_sb.h
@@ -21,4 +21,4 @@ static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_j
extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 0e6bde66..8d8c0b3d 100644
--- a/libbcachefs/journal_types.h
+++ b/libbcachefs/journal_types.h
@@ -43,9 +43,15 @@ struct journal_buf {
* flushed:
*/
+enum journal_pin_type {
+ JOURNAL_PIN_btree,
+ JOURNAL_PIN_key_cache,
+ JOURNAL_PIN_other,
+ JOURNAL_PIN_NR,
+};
+
struct journal_entry_pin_list {
- struct list_head list;
- struct list_head key_cache_list;
+ struct list_head list[JOURNAL_PIN_NR];
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index e3e39127..d93db07f 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -46,7 +46,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct bkey_i *n;
int ret;
- if (!bch2_bkey_has_device(k, dev_idx))
+ if (!bch2_bkey_has_device_c(k, dev_idx))
return 0;
n = bch2_bkey_make_mut(trans, k);
@@ -130,8 +130,7 @@ retry:
while (bch2_trans_begin(&trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
- if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
- dev_idx))
+ if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
bch2_bkey_buf_copy(&k, c, &b->key);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 5e952d6c..bb5061bc 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -41,7 +41,8 @@ static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
}
struct moving_io {
- struct list_head list;
+ struct list_head read_list;
+ struct list_head io_list;
struct move_bucket_in_flight *b;
struct closure cl;
bool read_completed;
@@ -65,8 +66,12 @@ static void move_free(struct moving_io *io)
atomic_dec(&io->b->count);
bch2_data_update_exit(&io->write);
+
+ mutex_lock(&ctxt->lock);
+ list_del(&io->io_list);
wake_up(&ctxt->wait);
- bch2_write_ref_put(c, BCH_WRITE_REF_move);
+ mutex_unlock(&ctxt->lock);
+
kfree(io);
}
@@ -101,7 +106,7 @@ static void move_write(struct moving_io *io)
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
struct moving_io *io =
- list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
return io && io->read_completed ? io : NULL;
}
@@ -128,7 +133,7 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
bch2_trans_unlock(trans);
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
- list_del(&io->list);
+ list_del(&io->read_list);
move_write(io);
}
}
@@ -145,6 +150,8 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
+ struct bch_fs *c = ctxt->c;
+
move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
@@ -154,12 +161,15 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->read_ios));
if (ctxt->stats) {
- progress_list_del(ctxt->c, ctxt->stats);
-
- trace_move_data(ctxt->c,
+ progress_list_del(c, ctxt->stats);
+ trace_move_data(c,
atomic64_read(&ctxt->stats->sectors_moved),
atomic64_read(&ctxt->stats->keys_moved));
}
+
+ mutex_lock(&c->moving_context_lock);
+ list_del(&ctxt->list);
+ mutex_unlock(&c->moving_context_lock);
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -172,15 +182,23 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
memset(ctxt, 0, sizeof(*ctxt));
ctxt->c = c;
+ ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
ctxt->wp = wp;
ctxt->wait_on_copygc = wait_on_copygc;
closure_init_stack(&ctxt->cl);
+
+ mutex_init(&ctxt->lock);
INIT_LIST_HEAD(&ctxt->reads);
+ INIT_LIST_HEAD(&ctxt->ios);
init_waitqueue_head(&ctxt->wait);
+ mutex_lock(&c->moving_context_lock);
+ list_add(&ctxt->list, &c->moving_context_list);
+ mutex_unlock(&c->moving_context_lock);
+
if (stats) {
progress_list_add(c, stats);
stats->data_type = BCH_DATA_user;
@@ -262,9 +280,6 @@ static int bch2_move_extent(struct btree_trans *trans,
return 0;
}
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
- return -BCH_ERR_erofs_no_writes;
-
/*
* Before memory allocations & taking nocow locks in
* bch2_data_update_init():
@@ -334,9 +349,14 @@ static int bch2_move_extent(struct btree_trans *trans,
this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
trace_move_extent_read(k.k);
+
+ mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
atomic_inc(&ctxt->read_ios);
- list_add_tail(&io->list, &ctxt->reads);
+
+ list_add_tail(&io->read_list, &ctxt->reads);
+ list_add_tail(&io->io_list, &ctxt->ios);
+ mutex_unlock(&ctxt->lock);
/*
* dropped by move_read_endio() - guards against use after free of
@@ -354,7 +374,6 @@ err_free_pages:
err_free:
kfree(io);
err:
- bch2_write_ref_put(c, BCH_WRITE_REF_move);
trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}
@@ -759,8 +778,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
data_opts.rewrite_ptrs = 0;
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
- if (ptr->dev == bucket.inode)
+ if (ptr->dev == bucket.inode) {
data_opts.rewrite_ptrs |= 1U << i;
+ if (ptr->cached) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
+ }
i++;
}
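/*
 * If the extent's pointer into the bucket being evacuated is cached, the data
 * doesn't need to be moved - a cached copy can simply be dropped - so the
 * extent is skipped above rather than rewritten.
 */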
@@ -819,14 +843,6 @@ next:
}
trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) {
- bch2_trans_unlock(trans);
- move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
- closure_sync(&ctxt->cl);
- if (!ctxt->write_error)
- bch2_verify_bucket_evacuated(trans, bucket, gen);
- }
err:
bch2_bkey_buf_exit(&sk, c);
return ret;
@@ -1111,3 +1127,67 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
+
+void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_move_stats *stats;
+
+ mutex_lock(&c->data_progress_lock);
+ list_for_each_entry(stats, &c->data_progress_list, list) {
+ prt_printf(out, "%s: data type %s btree_id %s position: ",
+ stats->name,
+ bch2_data_types[stats->data_type],
+ bch2_btree_ids[stats->btree_id]);
+ bch2_bpos_to_text(out, stats->pos);
+ prt_printf(out, "%s", "\n");
+ }
+ mutex_unlock(&c->data_progress_lock);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ prt_printf(out, "%ps:", ctxt->fn);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "reads: %u sectors %u",
+ atomic_read(&ctxt->read_ios),
+ atomic_read(&ctxt->read_sectors));
+ prt_newline(out);
+
+ prt_printf(out, "writes: %u sectors %u",
+ atomic_read(&ctxt->write_ios),
+ atomic_read(&ctxt->write_sectors));
+ prt_newline(out);
+
+ printbuf_indent_add(out, 2);
+
+ mutex_lock(&ctxt->lock);
+ list_for_each_entry(io, &ctxt->ios, io_list) {
+ bch2_write_op_to_text(out, &io->write.op);
+ }
+ mutex_unlock(&ctxt->lock);
+
+ printbuf_indent_sub(out, 4);
+}
+
+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct moving_context *ctxt;
+
+ mutex_lock(&c->moving_context_lock);
+ list_for_each_entry(ctxt, &c->moving_context_list, list)
+ bch2_moving_ctxt_to_text(out, ctxt);
+ mutex_unlock(&c->moving_context_lock);
+}
+
+void bch2_fs_move_init(struct bch_fs *c)
+{
+ INIT_LIST_HEAD(&c->moving_context_list);
+ mutex_init(&c->moving_context_lock);
+
+ INIT_LIST_HEAD(&c->data_progress_list);
+ mutex_init(&c->data_progress_lock);
+}
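+/*
+ * bch2_fs_move_init() is called early in filesystem init (from
+ * bch2_fs_alloc(), as seen in super.c later in this diff), so the lists and
+ * locks exist before any moving_context or data job can be created.
+ */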
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 4c001387..50a6f7d7 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -11,6 +11,9 @@ struct bch_read_bio;
struct moving_context {
struct bch_fs *c;
+ struct list_head list;
+ void *fn;
+
struct bch_ratelimit *rate;
struct bch_move_stats *stats;
struct write_point_specifier wp;
@@ -19,7 +22,10 @@ struct moving_context {
/* For waiting on outstanding reads and writes: */
struct closure cl;
+
+ struct mutex lock;
struct list_head reads;
+ struct list_head ios;
/* in flight sectors: */
atomic_t read_sectors;
@@ -84,6 +90,9 @@ int bch2_data_job(struct bch_fs *,
struct bch_ioctl_data);
void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
+void bch2_fs_move_init(struct bch_fs *);
#endif /* _BCACHEFS_MOVE_H */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 79aaa45f..178f96a6 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -46,7 +46,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset))
return 0;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0);
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
bch2_trans_iter_exit(trans, &iter);
@@ -85,7 +85,7 @@ static int move_bucket_cmp(const void *_l, const void *_r)
const struct move_bucket *l = _l;
const struct move_bucket *r = _r;
- return bpos_cmp(l->bucket, r->bucket) ?: cmp_int(l->gen, r->gen);
+ return bkey_cmp(l->bucket, r->bucket);
}
static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b)
@@ -178,13 +178,13 @@ static int bch2_copygc(struct btree_trans *trans,
move_buckets_in_flight *buckets_in_flight)
{
struct bch_fs *c = trans->c;
- struct bch_move_stats move_stats;
struct data_update_opts data_opts = {
.btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc,
};
move_buckets buckets = { 0 };
struct move_bucket_in_flight *f;
struct move_bucket *i;
+ u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
ret = bch2_btree_write_buffer_flush(trans);
@@ -192,9 +192,6 @@ static int bch2_copygc(struct btree_trans *trans,
__func__, bch2_err_str(ret)))
return ret;
- bch2_move_stats_init(&move_stats, "copygc");
- ctxt->stats = &move_stats;
-
ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
@@ -222,8 +219,8 @@ err:
if (ret < 0 && !bch2_err_matches(ret, EROFS))
bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
- trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0);
- ctxt->stats = NULL;
+ moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
+ trace_and_count(c, copygc, c, moved, 0, 0, 0);
return ret;
}
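/*
 * copygc now keeps one long-lived bch_move_stats in the moving_context
 * (installed at thread startup, below), so per-pass throughput is reported as
 * a delta of sectors_moved rather than from a stats struct reset each pass.
 */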
@@ -282,6 +279,7 @@ static int bch2_copygc_thread(void *arg)
struct bch_fs *c = arg;
struct btree_trans trans;
struct moving_context ctxt;
+ struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
move_buckets_in_flight move_buckets;
u64 last, wait;
@@ -294,7 +292,9 @@ static int bch2_copygc_thread(void *arg)
set_freezable();
bch2_trans_init(&trans, c, 0, 0);
- bch2_moving_ctxt_init(&ctxt, c, NULL, NULL,
+
+ bch2_move_stats_init(&move_stats, "copygc");
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
writepoint_ptr(&c->copygc_write_point),
false);
@@ -334,8 +334,8 @@ static int bch2_copygc_thread(void *arg)
wake_up(&c->copygc_running_wq);
}
- bch2_moving_ctxt_exit(&ctxt);
bch2_trans_exit(&trans);
+ bch2_moving_ctxt_exit(&ctxt);
free_fifo(&move_buckets);
return 0;
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index afbf82d6..719693b3 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -92,6 +92,12 @@ enum opt_type {
#define RATELIMIT_ERRORS_DEFAULT false
#endif
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define BCACHEFS_VERBOSE_DEFAULT true
+#else
+#define BCACHEFS_VERBOSE_DEFAULT false
+#endif
+
#define BCH_OPTS() \
x(block_size, u16, \
OPT_FS|OPT_FORMAT| \
@@ -276,7 +282,7 @@ enum opt_type {
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
+ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \
NULL, "Extra debugging information during mount/recovery")\
x(journal_flush_delay, u32, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index d2e6adc1..d8426e75 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -189,7 +189,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink,
POS(0, c->reflink_hint),
- BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+ BTREE_ITER_SLOTS, k, ret) {
if (reflink_iter.pos.inode) {
bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
continue;
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index d7623965..bcc67c0f 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -513,7 +513,9 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
n->v.pad = 0;
SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
- ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
+ bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
if (ret)
goto err;
@@ -540,7 +542,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
n->v.children[1] = cpu_to_le32(new_snapids[1]);
n->v.subvol = 0;
SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
- ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+ ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
if (ret)
goto err;
}
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 359ca164..3a7f4e29 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -206,11 +206,15 @@ static void __bch2_fs_read_only(struct bch_fs *c)
unsigned i, clean_passes = 0;
u64 seq = 0;
+ bch2_fs_ec_stop(c);
+ bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
+ bch2_fs_ec_flush(c);
- bch_verbose(c, "flushing journal and stopping allocators");
+ bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
+ journal_cur_seq(&c->journal));
do {
clean_passes++;
@@ -224,7 +228,8 @@ static void __bch2_fs_read_only(struct bch_fs *c)
}
} while (clean_passes < 2);
- bch_verbose(c, "flushing journal and stopping allocators complete");
+ bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
+ journal_cur_seq(&c->journal));
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
@@ -679,6 +684,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_rebalance_init(c);
bch2_fs_quota_init(c);
bch2_fs_ec_init_early(c);
+ bch2_fs_move_init(c);
INIT_LIST_HEAD(&c->list);
@@ -697,17 +703,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
- INIT_LIST_HEAD(&c->ec_stripe_head_list);
- mutex_init(&c->ec_stripe_head_lock);
-
- INIT_LIST_HEAD(&c->ec_stripe_new_list);
- mutex_init(&c->ec_stripe_new_lock);
-
- INIT_LIST_HEAD(&c->data_progress_list);
- mutex_init(&c->data_progress_lock);
-
- mutex_init(&c->ec_stripes_heap_lock);
-
seqcount_init(&c->gc_pos_lock);
seqcount_init(&c->usage_lock);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index ed17b27f..1344ae4c 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -248,6 +248,7 @@ read_attribute(io_timers_read);
read_attribute(io_timers_write);
read_attribute(data_jobs);
+read_attribute(moving_ctxts);
#ifdef CONFIG_BCACHEFS_TESTS
write_attribute(perf_test);
@@ -277,25 +278,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
-static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
-{
- long ret = 0;
- struct bch_move_stats *stats;
-
- mutex_lock(&c->data_progress_lock);
- list_for_each_entry(stats, &c->data_progress_list, list) {
- prt_printf(out, "%s: data type %s btree_id %s position: ",
- stats->name,
- bch2_data_types[stats->data_type],
- bch2_btree_ids[stats->btree_id]);
- bch2_bpos_to_text(out, stats->pos);
- prt_printf(out, "%s", "\n");
- }
-
- mutex_unlock(&c->data_progress_lock);
- return ret;
-}
-
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans trans;
@@ -476,7 +458,10 @@ SHOW(bch2_fs)
bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
if (attr == &sysfs_data_jobs)
- data_progress_to_text(out, c);
+ bch2_data_jobs_to_text(out, c);
+
+ if (attr == &sysfs_moving_ctxts)
+ bch2_fs_moving_ctxts_to_text(out, c);
#ifdef BCH_WRITE_REF_DEBUG
if (attr == &sysfs_write_refs)
@@ -693,6 +678,7 @@ struct attribute *bch2_fs_internal_files[] = {
sysfs_pd_controller_files(rebalance),
&sysfs_data_jobs,
+ &sysfs_moving_ctxts,
&sysfs_internal_uuid,
NULL
diff --git a/linux/six.c b/linux/six.c
index 5a6eadc0..3d366a84 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -143,8 +143,17 @@ static int __do_six_trylock_type(struct six_lock *lock,
* lock, issue a wakeup because we might have caused a
* spurious trylock failure:
*/
+#if 0
+ /*
+ * This code should be sufficient, but we're seeing unexplained
+ * lost wakeups:
+ */
if (old.write_locking)
ret = -1 - SIX_LOCK_write;
+#else
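+	/*
+	 * Workaround: treat every trylock failure here as potentially having
+	 * caused a spurious write-lock failure, so the caller always issues a
+	 * wakeup - extra spurious wakeups in exchange for never losing one.
+	 */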
+ if (!ret)
+ ret = -1 - SIX_LOCK_write;
+#endif
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic64_add(__SIX_VAL(write_locking, 1),
@@ -320,11 +329,10 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
* Similar to the lock path, we may have caused a spurious write
* lock fail and need to issue a wakeup:
*/
- if (old.write_locking)
- six_lock_wakeup(lock, old, SIX_LOCK_write);
-
if (ret)
six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+ else
+ six_lock_wakeup(lock, old, SIX_LOCK_write);
return ret;
}
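/*
 * Same reasoning in the relock path above: a successful relock can't have
 * caused a spurious write-lock failure, so the wakeup is now issued only when
 * the relock itself failed, rather than whenever a writer was mid-acquire.
 */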