From fa358537725c8065b058b558125cf15359936f94 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Mar 2023 12:56:38 -0400 Subject: Update bcachefs sources to 72405e7ff8 bcachefs: Fix bch2_check_extents_to_backpointers() --- .bcachefs_revision | 2 +- libbcachefs/alloc_background.c | 40 +-------- libbcachefs/alloc_background.h | 2 +- libbcachefs/alloc_foreground.c | 149 +++++++++++++++++++++++++--------- libbcachefs/alloc_foreground.h | 8 +- libbcachefs/backpointers.c | 30 ++++--- libbcachefs/bcachefs.h | 23 ++++-- libbcachefs/btree_iter.c | 2 +- libbcachefs/btree_key_cache.c | 23 +++++- libbcachefs/btree_key_cache.h | 2 +- libbcachefs/btree_locking.c | 34 ++++++++ libbcachefs/btree_locking.h | 13 +-- libbcachefs/btree_update.h | 3 + libbcachefs/btree_update_leaf.c | 10 +-- libbcachefs/buckets.c | 2 +- libbcachefs/data_update.c | 105 ++++++++++++------------ libbcachefs/ec.c | 133 ++++++++++++++++++++++-------- libbcachefs/ec.h | 40 ++++++--- libbcachefs/extents.c | 32 ++++---- libbcachefs/extents.h | 16 +++- libbcachefs/fsck.c | 4 +- libbcachefs/io.c | 72 ++++++++++------ libbcachefs/io.h | 61 +++++++------- libbcachefs/io_types.h | 2 +- libbcachefs/journal.c | 176 +++++++++++++++++++--------------------- libbcachefs/journal_io.c | 3 +- libbcachefs/journal_reclaim.c | 83 ++++++++++++------- libbcachefs/journal_sb.c | 27 +++--- libbcachefs/journal_sb.h | 2 +- libbcachefs/journal_types.h | 10 ++- libbcachefs/migrate.c | 5 +- libbcachefs/move.c | 122 +++++++++++++++++++++++----- libbcachefs/move.h | 9 ++ libbcachefs/movinggc.c | 20 ++--- libbcachefs/opts.h | 8 +- libbcachefs/reflink.c | 2 +- libbcachefs/subvolume.c | 6 +- libbcachefs/super.c | 21 ++--- libbcachefs/sysfs.c | 26 ++---- linux/six.c | 14 +++- 40 files changed, 822 insertions(+), 520 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 2845be68..d8d13865 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -3856459b1b9f37cebee2bca3c9edcafaf393aa98 +72405e7ff8c5fb569b74b046d19866ee480f29b7 diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 5f4bb82c..009a85bc 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -1006,7 +1006,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket) iter = bucket->inode; ca = __bch2_next_dev(c, &iter, NULL); if (ca) - bucket->offset = ca->mi.first_bucket; + *bucket = POS(ca->dev_idx, ca->mi.first_bucket); rcu_read_unlock(); return ca != NULL; @@ -2158,43 +2158,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) */ bch2_recalc_capacity(c); - /* Next, close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_writepoint_stop(c, ca, &c->write_points[i]); - - bch2_writepoint_stop(c, ca, &c->copygc_write_point); - bch2_writepoint_stop(c, ca, &c->rebalance_write_point); - bch2_writepoint_stop(c, ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - spin_lock(&c->freelist_lock); - i = 0; - while (i < c->open_buckets_partial_nr) { - struct open_bucket *ob = - c->open_buckets + c->open_buckets_partial[i]; - - if (ob->dev == ca->dev_idx) { - swap(c->open_buckets_partial[i], - c->open_buckets_partial[--c->open_buckets_partial_nr]); - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - bch2_open_bucket_put(c, ob); - spin_lock(&c->freelist_lock); - } else { - i++; - } - } - spin_unlock(&c->freelist_lock); - - bch2_ec_stop_dev(c, ca); + bch2_open_buckets_stop(c, ca, false); /* * Wake up threads that were blocked on allocation, so they can notice diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index c9ff590e..32479839 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -216,7 +216,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, u64 free = max_t(s64, 0, u.d[BCH_DATA_free].buckets + u.d[BCH_DATA_need_discard].buckets - - bch2_dev_buckets_reserved(ca, RESERVE_none)); + - bch2_dev_buckets_reserved(ca, RESERVE_stripe)); return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 3a67ac0d..d52f30ac 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -97,7 +97,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { - ec_stripe_new_put(c, ob->ec); + ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); return; } @@ -658,9 +658,11 @@ static int add_new_bucket(struct bch_fs *c, bch_dev_bkey_exists(c, ob->dev)->mi.durability; BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; + *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + ? durability : 1; *have_cache |= !durability; ob_push(c, ptrs, ob); @@ -679,6 +681,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, + unsigned flags, enum bch_data_type data_type, enum alloc_reserve reserve, struct closure *cl) @@ -729,7 +732,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, 0, ob)) { + have_cache, flags, ob)) { ret = 0; break; } @@ -796,7 +799,7 @@ got_bucket: ob->ec_idx = ec_idx; ob->ec = h->s; - ec_stripe_new_get(h->s); + ec_stripe_new_get(h->s, STRIPE_REF_io); ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, @@ -823,7 +826,7 @@ static bool want_bucket(struct bch_fs *c, return false; if (!ca->mi.durability && - (wp->data_type != BCH_DATA_user || !*have_cache)) + (wp->data_type == BCH_DATA_btree || ec || *have_cache)) return false; if (ec != (ob->ec != NULL)) @@ -877,6 +880,9 @@ static int bucket_alloc_set_partial(struct bch_fs *c, spin_lock(&c->freelist_lock); + if (!c->open_buckets_partial_nr) + goto unlock; + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; @@ -902,7 +908,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, break; } } - +unlock: spin_unlock(&c->freelist_lock); return ret; } @@ -967,7 +973,7 @@ retry_blocking: */ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, - wp->data_type, reserve, cl); + flags, wp->data_type, reserve, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1017,45 +1023,96 @@ static int open_bucket_add_buckets(struct btree_trans *trans, return ret < 0 ? ret : 0; } -void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, - struct open_buckets *obs) +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) { - struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob, *ob2; - unsigned i, j; - - open_bucket_for_each(c, obs, ob, i) { - bool drop = !ca || ob->dev == ca->dev_idx; + if (ec) { + return ob->ec != NULL; + } else if (ca) { + bool drop = ob->dev == ca->dev_idx; + struct open_bucket *ob2; + unsigned i; if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); - for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { - if (!ob->ec->blocks[j]) + for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { + if (!ob->ec->blocks[i]) continue; - ob2 = c->open_buckets + ob->ec->blocks[j]; + ob2 = c->open_buckets + ob->ec->blocks[i]; drop |= ob2->dev == ca->dev_idx; } mutex_unlock(&ob->ec->lock); } - if (drop) - bch2_open_bucket_put(c, ob); - else - ob_push(c, &ptrs, ob); + return drop; + } else { + return true; } - - *obs = ptrs; } -void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec, struct write_point *wp) { + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + mutex_lock(&wp->lock); - bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (should_drop_bucket(ob, c, ca, ec)) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + wp->ptrs = ptrs; mutex_unlock(&wp->lock); } +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec) +{ + unsigned i; + + /* Next, close write points that point to this device... */ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); + + bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); + bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (should_drop_bucket(ob, c, ca, ec)) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; + } + } + spin_unlock(&c->freelist_lock); + + bch2_ec_stop_dev(c, ca); +} + static inline struct hlist_head *writepoint_hash(struct bch_fs *c, unsigned long write_point) { @@ -1101,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct bch_fs *c, - unsigned old_nr) +static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr) { struct write_point *wp; @@ -1123,7 +1179,7 @@ static bool try_decrease_writepoints(struct bch_fs *c, hlist_del_rcu(&wp->node); mutex_unlock(&c->write_points_hash_lock); - bch2_writepoint_stop(c, NULL, wp); + bch2_writepoint_stop(c, NULL, false, wp); return true; } @@ -1217,6 +1273,8 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, int ret; int i; + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + BUG_ON(!nr_replicas || !nr_replicas_required); retry: ptrs.nr = 0; @@ -1230,13 +1288,7 @@ retry: if (wp->data_type != BCH_DATA_user) have_cache = true; - if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, reserve, - flags, cl); - } else { + if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, @@ -1246,11 +1298,28 @@ retry: bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; + /* Don't retry from all devices if we're out of open buckets: */ + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + goto allocate_blocking; + + /* + * Only try to allocate cache (durability = 0 devices) from the + * specified target: + */ + have_cache = true; + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, flags, cl); + } else { +allocate_blocking: + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, reserve, + flags, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); @@ -1380,14 +1449,16 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); unsigned data_type = ob->data_type; barrier(); /* READ_ONCE() doesn't work on bitfields */ - prt_printf(out, "%zu ref %u %s %u:%llu gen %u", + prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", ob - c->open_buckets, atomic_read(&ob->pin), data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", - ob->dev, ob->bucket, ob->gen); + ob->dev, ob->bucket, ob->gen, + ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); if (ob->ec) prt_printf(out, " ec idx %llu", ob->ec->idx); if (ob->on_partial_list) diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index e9b3b142..8a1cf425 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -151,7 +151,7 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, + unsigned, unsigned *, bool *, unsigned, enum bch_data_type, enum alloc_reserve, struct closure *); @@ -202,11 +202,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, - struct open_buckets *); - -void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, - struct write_point *); +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); static inline struct write_point_specifier writepoint_hashed(unsigned long v) { diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index a40c2612..8517c563 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -549,13 +549,18 @@ int bch2_check_btree_backpointers(struct bch_fs *c) bch2_check_btree_backpointer(&trans, &iter, k))); } +struct bpos_level { + unsigned level; + struct bpos pos; +}; + static int check_bp_exists(struct btree_trans *trans, struct bpos bucket_pos, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, struct bpos bucket_end, - struct bpos *last_flushed_pos) + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter, bp_iter = { NULL }; @@ -600,8 +605,11 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - if (!bpos_eq(*last_flushed_pos, orig_k.k->p)) { - *last_flushed_pos = orig_k.k->p; + if (last_flushed->level != bp.level || + !bpos_eq(last_flushed->pos, orig_k.k->p)) { + last_flushed->level = bp.level; + last_flushed->pos = orig_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: -BCH_ERR_transaction_restart_write_buffer_flush; goto out; @@ -639,7 +647,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct btree_iter *iter, struct bpos bucket_start, struct bpos bucket_end, - struct bpos *last_flushed_pos) + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; @@ -668,7 +676,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end, - last_flushed_pos); + last_flushed); if (ret) return ret; } @@ -680,7 +688,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, enum btree_id btree_id, struct bpos bucket_start, struct bpos bucket_end, - struct bpos *last_flushed_pos) + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -709,12 +717,12 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, k, p, &bucket_pos, &bp); ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end, - last_flushed_pos); + last_flushed); if (ret) goto err; } @@ -794,7 +802,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, { struct btree_iter iter; enum btree_id btree_id; - struct bpos last_flushed_pos = SPOS_MAX; + struct bpos_level last_flushed = { UINT_MAX }; int ret = 0; for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { @@ -811,7 +819,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_NOFAIL, check_extent_to_backpointers(trans, &iter, bucket_start, bucket_end, - &last_flushed_pos)); + &last_flushed)); if (ret) break; } while (!bch2_btree_iter_advance(&iter)); @@ -826,7 +834,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_NOFAIL, check_btree_root_to_backpointers(trans, btree_id, bucket_start, bucket_end, - &last_flushed_pos)); + &last_flushed)); if (ret) break; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 25a32fd6..348ee8e8 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -214,8 +214,11 @@ #define BCH_WRITE_REF_DEBUG #endif +#ifndef dynamic_fault #define dynamic_fault(...) 0 -#define race_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") #define trace_and_count(_c, _name, ...) \ do { \ @@ -652,7 +655,6 @@ typedef struct { x(fallocate) \ x(discard) \ x(invalidate) \ - x(move) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ x(sysfs) @@ -922,6 +924,13 @@ struct bch_fs { mempool_t large_bkey_pool; + /* MOVE.C */ + struct list_head moving_context_list; + struct mutex moving_context_lock; + + struct list_head data_progress_list; + struct mutex data_progress_lock; + /* REBALANCE */ struct bch_fs_rebalance rebalance; @@ -932,10 +941,6 @@ struct bch_fs { bool copygc_running; wait_queue_head_t copygc_running_wq; - /* DATA PROGRESS STATS */ - struct list_head data_progress_list; - struct mutex data_progress_lock; - /* STRIPES: */ GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; @@ -952,14 +957,14 @@ struct bch_fs { struct list_head ec_stripe_new_list; struct mutex ec_stripe_new_lock; + wait_queue_head_t ec_stripe_new_wait; struct work_struct ec_stripe_create_work; u64 ec_stripe_hint; - struct bio_set ec_bioset; - struct work_struct ec_stripe_delete_work; - struct llist_head ec_stripe_delete_list; + + struct bio_set ec_bioset; /* REFLINK */ u64 reflink_hint; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 2d344993..0a3e5605 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -16,7 +16,7 @@ #include "replicas.h" #include "subvolume.h" -#include +#include #include #include diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 298a674d..27a73933 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -770,11 +770,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, bool bch2_btree_insert_key_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, - struct bkey_i *insert) + struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; BUG_ON(insert->k.u64s > ck->u64s); @@ -802,9 +802,24 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, kick_reclaim = true; } + /* + * To minimize lock contention, we only add the journal pin here and + * defer pin updates to the flush callback via ->seq. Be careful not to + * update ->seq on nojournal commits because we don't want to update the + * pin to a seq that doesn't include journal updates on disk. Otherwise + * we risk losing the update after a crash. + * + * The only exception is if the pin is not active in the first place. We + * have to add the pin because journal reclaim drives key cache + * flushing. The flush callback will not proceed unless ->seq matches + * the latest pin, so make sure it starts with a consistent value. + */ + if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + !journal_pin_active(&ck->journal)) { + ck->seq = trans->journal_res.seq; + } bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &ck->journal, bch2_btree_key_cache_journal_flush); - ck->seq = trans->journal_res.seq; if (kick_reclaim) journal_reclaim_kick(&c->journal); diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index c86d5e48..be3acde2 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -30,7 +30,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, unsigned); bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, - struct btree_path *, struct bkey_i *); + struct btree_insert_entry *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 0032d0eb..b9998665 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -388,6 +388,40 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p return ret; } +void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + struct btree_path *linked; + unsigned i; + int ret; + + /* + * XXX BIG FAT NOTICE + * + * Drop all read locks before taking a write lock: + * + * This is a hack, because bch2_btree_node_lock_write_nofail() is a + * hack - but by dropping read locks first, this should never fail, and + * we only use this in code paths where whatever read locks we've + * already taken are no longer needed: + */ + + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_read_locked(linked, i)) { + btree_node_unlock(trans, linked, i); + btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); + } + } + + ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); +} + /* relock */ static inline bool btree_path_get_locks(struct btree_trans *trans, diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index bd658e5c..327780ce 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -299,15 +299,6 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); } -static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - int ret = __btree_node_lock_write(trans, path, b, true); - - BUG_ON(ret); -} - static inline int __must_check bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, @@ -316,6 +307,10 @@ bch2_btree_node_lock_write(struct btree_trans *trans, return __btree_node_lock_write(trans, path, b, false); } +void bch2_btree_node_lock_write_nofail(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *); + /* relock: */ bool bch2_btree_path_relock_norestart(struct btree_trans *, diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index ee1d1593..46fb4a9e 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -13,6 +13,9 @@ void bch2_btree_node_prep_for_write(struct btree_trans *, bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_i *); + +int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); +int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index c93c132d..629e5288 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -227,12 +227,12 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, return 0; } -static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 0, seq); } -static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 1, seq); } @@ -244,8 +244,8 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, bch2_journal_pin_add(&c->journal, seq, &w->journal, btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); + ? bch2_btree_node_flush0 + : bch2_btree_node_flush1); } /** @@ -765,7 +765,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (!i->cached) btree_insert_key_leaf(trans, i); else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i->path, i->k); + bch2_btree_insert_key_cached(trans, flags, i); else { bch2_btree_key_cache_drop(trans, i->path); btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 6805f2c0..1bcef419 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1855,7 +1855,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - if (a->v.data_type && a->v.data_type != type) { + if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index eb91e24c..e414d1af 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -92,18 +92,6 @@ static int insert_snapshot_whiteouts(struct btree_trans *trans, return ret; } -static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - struct bch_extent_ptr *ptr; - - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) { - bch2_extent_ptr_set_cached(k, ptr); - return; - } -} - static int __bch2_data_update_index_update(struct btree_trans *trans, struct bch_write_op *op) { @@ -126,15 +114,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, while (1) { struct bkey_s_c k; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct bkey_i *insert; + struct bkey_i *insert = NULL; struct bkey_i_extent *new; - const union bch_extent_entry *entry; + const union bch_extent_entry *entry_c; + union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bch_extent_ptr *ptr; + const struct bch_extent_ptr *ptr_c; struct bpos next_pos; - bool did_work = false; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned i; + unsigned rewrites_found = 0, durability, i; bch2_trans_begin(trans); @@ -146,7 +136,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); if (!bch2_extents_match(k, old)) - goto nomatch; + goto nowork; bkey_reassemble(_insert.k, k); insert = _insert.k; @@ -169,50 +159,60 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * Fist, drop rewrite_ptrs from @new: */ i = 0; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { if (((1U << i) & m->data_opts.rewrite_ptrs) && - bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { - /* - * If we're going to be adding a pointer to the - * same device, we have to drop the old one - - * otherwise, we can just mark it cached: - */ - if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) - bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); - else - bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + rewrites_found |= 1U << i; } i++; } + if (m->data_opts.rewrite_ptrs && + !rewrites_found && + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) + goto nowork; - /* Add new ptrs: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - const struct bch_extent_ptr *existing_ptr = - bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev); - - if (existing_ptr && existing_ptr->cached) { - /* - * We're replacing a cached pointer with a non - * cached pointer: - */ - bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), - existing_ptr->dev); - } else if (existing_ptr) { - /* - * raced with another move op? extent already - * has a pointer to the device we just wrote - * data to - */ - continue; + /* + * A replica that we just wrote might conflict with a replica + * that we want to keep, due to racing with another move: + */ +restart_drop_conflicting_replicas: + extent_for_each_ptr(extent_i_to_s(new), ptr) + if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && + !ptr_c->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); + goto restart_drop_conflicting_replicas; } - bch2_extent_ptr_decoded_append(insert, &p); - did_work = true; + if (!bkey_val_u64s(&new->k)) + goto nowork; + + /* Now, drop pointers that conflict with what we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + + durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + + bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); + + /* Now, drop excess replicas: */ +restart_drop_extra_replicas: + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } } - if (!did_work) - goto nomatch; + /* Finally, add the pointers we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, bkey_i_to_s(insert)); @@ -253,6 +253,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { @@ -273,7 +274,7 @@ next: goto out; } continue; -nomatch: +nowork: if (m->ctxt && m->ctxt->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 7d43fd4a..09c6f93c 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -659,14 +659,13 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) static u64 stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; - size_t heap_idx; lockdep_assert_held(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->used; heap_idx++) - if (h->data[heap_idx].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[heap_idx].idx)) - return h->data[heap_idx].idx; + if (h->used && + h->data[0].blocks_nonempty == 0 && + !bch2_stripe_is_open(c, h->data[0].idx)) + return h->data[0].idx; return 0; } @@ -959,7 +958,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bkey_reassemble(n, k); bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); - ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); + ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); stripe_ptr = (struct bch_extent_stripe_ptr) { @@ -990,6 +989,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b while (1) { ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL, ec_stripe_update_extent(trans, bucket_pos, bucket.gen, s, &bp_offset)); @@ -1057,6 +1057,13 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, s->err = ret; } +void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) +{ + if (s->idx) + bch2_stripe_close(c, s); + kfree(s); +} + /* * data buckets of new stripe all written: create the stripe */ @@ -1072,13 +1079,15 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_sync(&s->iodone); - for (i = 0; i < nr_data; i++) - if (s->blocks[i]) { - ob = c->open_buckets + s->blocks[i]; + if (!s->err) { + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; - if (ob->sectors_free) - zero_out_rest_of_ec_bucket(c, s, i, ob); - } + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + } if (s->err) { if (!bch2_err_matches(s->err, EROFS)) @@ -1119,7 +1128,9 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err; } - ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + ret = bch2_trans_do(c, &s->res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, ec_stripe_key_update(&trans, &s->new_stripe.key, !s->have_existing_stripe)); if (ret) { @@ -1152,13 +1163,11 @@ err: list_del(&s->list); mutex_unlock(&c->ec_stripe_new_lock); - if (s->idx) - bch2_stripe_close(c, s); - ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); - kfree(s); + + ec_stripe_new_put(c, s, STRIPE_REF_stripe); } static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) @@ -1167,7 +1176,7 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) - if (!atomic_read(&s->pin)) + if (!atomic_read(&s->ref[STRIPE_REF_io])) goto out; s = NULL; out: @@ -1209,7 +1218,7 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) list_add(&s->list, &c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); - ec_stripe_new_put(c, s); + ec_stripe_new_put(c, s, STRIPE_REF_io); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) @@ -1321,7 +1330,8 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) mutex_init(&s->lock); closure_init(&s->iodone, NULL); - atomic_set(&s->pin, 1); + atomic_set(&s->ref[STRIPE_REF_stripe], 1); + atomic_set(&s->ref[STRIPE_REF_io], 1); s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, @@ -1402,6 +1412,11 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret) return ERR_PTR(ret); + if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + h = ERR_PTR(-EROFS); + goto found; + } + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && @@ -1451,7 +1466,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ &devs, h->s->nr_parity, &nr_have_parity, - &have_cache, + &have_cache, 0, BCH_DATA_parity, reserve, cl); @@ -1478,7 +1493,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ &devs, h->s->nr_data, &nr_have_data, - &have_cache, + &have_cache, 0, BCH_DATA_user, reserve, cl); @@ -1706,6 +1721,14 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; + if (reserve == RESERVE_movinggc) { + ret = new_stripe_alloc_buckets(trans, h, reserve, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (ret) + goto err; + goto allocate_buf; + } + /* XXX freelist_wait? */ closure_wait(&c->freelist_wait, cl); waiting = true; @@ -1738,7 +1761,7 @@ err: return ERR_PTR(ret); } -void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; struct open_bucket *ob; @@ -1746,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - mutex_lock(&h->lock); if (!h->s) goto unlock; + if (!ca) + goto found; + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { if (!h->s->blocks[i]) continue; @@ -1769,6 +1794,32 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + __bch2_ec_stop(c, ca); +} + +void bch2_fs_ec_stop(struct bch_fs *c) +{ + __bch2_ec_stop(c, NULL); +} + +static bool bch2_fs_ec_flush_done(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->ec_stripe_new_lock); + ret = list_empty(&c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + return ret; +} + +void bch2_fs_ec_flush(struct bch_fs *c) +{ + wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); +} + int bch2_stripes_read(struct bch_fs *c) { struct btree_trans trans; @@ -1821,13 +1872,16 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) size_t i; mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->used, 20); i++) { + for (i = 0; i < min_t(size_t, h->used, 50); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); - prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, + prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, h->data[i].blocks_nonempty, m->nr_blocks - m->nr_redundant, m->nr_redundant); + if (bch2_stripe_is_open(c, h->data[i].idx)) + prt_str(out, " open"); + prt_newline(out); } mutex_unlock(&c->ec_stripes_heap_lock); } @@ -1839,22 +1893,27 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "target %u algo %u redundancy %u:\n", - h->target, h->algo, h->redundancy); + prt_printf(out, "target %u algo %u redundancy %u %s:\n", + h->target, h->algo, h->redundancy, + bch2_alloc_reserves[h->reserve]); if (h->s) - prt_printf(out, "\tpending: idx %llu blocks %u+%u allocated %u\n", + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", h->s->idx, h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); + prt_printf(out, "in flight:\n"); + mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tin flight: idx %llu blocks %u+%u pin %u\n", + prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", s->idx, s->nr_data, s->nr_parity, - atomic_read(&s->pin)); + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_alloc_reserves[s->h->reserve]); } mutex_unlock(&c->ec_stripe_new_lock); } @@ -1892,14 +1951,22 @@ void bch2_fs_ec_exit(struct bch_fs *c) void bch2_fs_ec_init_early(struct bch_fs *c) { + spin_lock_init(&c->ec_stripes_new_lock); + mutex_init(&c->ec_stripes_heap_lock); + + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + init_waitqueue_head(&c->ec_stripe_new_wait); + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); } int bch2_fs_ec_init(struct bch_fs *c) { - spin_lock_init(&c->ec_stripes_new_lock); - return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index d112aea9..7c08a49d 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -143,6 +143,12 @@ struct ec_stripe_buf { struct ec_stripe_head; +enum ec_stripe_ref { + STRIPE_REF_io, + STRIPE_REF_stripe, + STRIPE_REF_NR +}; + struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; @@ -154,8 +160,7 @@ struct ec_stripe_new { struct closure iodone; - /* counts in flight writes, stripe is created when pin == 0 */ - atomic_t pin; + atomic_t ref[STRIPE_REF_NR]; int err; @@ -213,24 +218,35 @@ void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); +void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); -static inline void ec_stripe_new_get(struct ec_stripe_new *s) +static inline void ec_stripe_new_get(struct ec_stripe_new *s, + enum ec_stripe_ref ref) { - atomic_inc(&s->pin); + atomic_inc(&s->ref[ref]); } -static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, + enum ec_stripe_ref ref) { - BUG_ON(atomic_read(&s->pin) <= 0); - BUG_ON(!s->err && !s->idx); - - if (atomic_dec_and_test(&s->pin)) - bch2_ec_do_stripe_creates(c); + BUG_ON(atomic_read(&s->ref[ref]) <= 0); + + if (atomic_dec_and_test(&s->ref[ref])) + switch (ref) { + case STRIPE_REF_stripe: + bch2_ec_stripe_new_free(c, s); + break; + case STRIPE_REF_io: + bch2_ec_do_stripe_creates(c); + break; + default: + unreachable(); + } } void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); - -void bch2_ec_flush_new_stripes(struct bch_fs *); +void bch2_fs_ec_stop(struct bch_fs *); +void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 4fc581be..e2c09ea4 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -26,8 +26,6 @@ #include -static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); - static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -512,7 +510,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -765,8 +763,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) /* * Returns pointer to the next entry after the one being dropped: */ -static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; @@ -809,7 +807,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, { bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; union bch_extent_entry *ret = - __bch2_bkey_drop_ptr(k, ptr); + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -840,14 +838,13 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr = (void *) bch2_bkey_has_device(k.s_c, dev); + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); if (ptr) - __bch2_bkey_drop_ptr(k, ptr); + bch2_bkey_drop_ptr_noerror(k, ptr); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -922,11 +919,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) } } -bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, - struct bkey_s_c k2) +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) { - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry2; + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; struct extent_ptr_decoded p2; bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) @@ -934,9 +931,9 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, p1.ptr.gen == p2.ptr.gen && (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return true; + return &entry2->ptr; - return false; + return NULL; } void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) @@ -992,6 +989,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca; bool first = true; + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability(c, k)); + bkey_extent_entry_for_each(ptrs, entry) { if (!first) prt_printf(out, " "); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index bac6a1ed..9b026ae9 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -613,14 +613,21 @@ unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_drop_device(struct bkey_s, unsigned); void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); + +static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) +{ + return (void *) bch2_bkey_has_device_c(k.s_c, dev); +} + bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); switch (k->k.type) { case KEY_TYPE_btree_ptr: @@ -642,6 +649,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, + struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -665,7 +674,8 @@ do { \ bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index e232f331..5e6dc6c3 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -954,11 +954,11 @@ static int check_inode(struct btree_trans *trans, iter->pos.snapshot), POS(u.bi_inum, U64_MAX), 0, NULL); - if (ret) { + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error in fsck: error truncating inode: %s", bch2_err_str(ret)); + if (ret) return ret; - } /* * We truncated without our normal sector accounting hook, just diff --git a/libbcachefs/io.c b/libbcachefs/io.c index ea0fd631..76856bfd 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -218,7 +218,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -705,7 +706,8 @@ static void bch2_write_done(struct closure *cl) struct bch_fs *c = op->c; bch2_disk_reservation_put(c, &op->res); - bch2_write_ref_put(c, BCH_WRITE_REF_write); + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); @@ -834,36 +836,30 @@ static void bch2_write_index(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; if ((op->flags & BCH_WRITE_DONE) && (op->flags & BCH_WRITE_MOVE)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - barrier(); - - /* - * We're not using wp->writes_lock here, so this is racey: that's ok, - * because this is just for diagnostic purposes, and we're running out - * of interrupt context here so if we were to take the log we'd have to - * switch to spin_lock_irq()/irqsave(), which is not free: - */ + spin_lock_irqsave(&wp->writes_lock, flags); if (wp->state == WRITE_POINT_waiting_io) __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); - op->btree_update_ready = true; queue_work(wq, &wp->index_update_work); } static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) { - op->btree_update_ready = false; op->wp = wp; - spin_lock(&wp->writes_lock); - list_add_tail(&op->wp_list, &wp->writes); - if (wp->state == WRITE_POINT_stopped) + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); __wp_update_state(wp, WRITE_POINT_waiting_io); - spin_unlock(&wp->writes_lock); + spin_unlock_irq(&wp->writes_lock); + } } void bch2_write_point_do_index_updates(struct work_struct *work) @@ -873,16 +869,12 @@ void bch2_write_point_do_index_updates(struct work_struct *work) struct bch_write_op *op; while (1) { - spin_lock(&wp->writes_lock); - list_for_each_entry(op, &wp->writes, wp_list) - if (op->btree_update_ready) { - list_del(&op->wp_list); - goto unlock; - } - op = NULL; -unlock: + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); wp_update_state(wp, op != NULL); - spin_unlock(&wp->writes_lock); + spin_unlock_irq(&wp->writes_lock); if (!op) break; @@ -1673,7 +1665,6 @@ static void __bch2_write(struct bch_write_op *op) } again: memset(&op->failed, 0, sizeof(op->failed)); - op->btree_update_ready = false; do { struct bkey_i *key_to_write; @@ -1853,7 +1844,12 @@ void bch2_write(struct closure *cl) goto err; } - if (c->opts.nochanges || + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; @@ -1881,6 +1877,28 @@ err: op->end_io(op); } +const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); +} + /* Cache promotion on read */ struct promote_op { diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 166ad681..90948bb0 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -28,41 +28,34 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, const char *bch2_blk_status_to_str(blk_status_t); -enum bch_write_flags { - __BCH_WRITE_ALLOC_NOWAIT, - __BCH_WRITE_CACHED, - __BCH_WRITE_DATA_ENCODED, - __BCH_WRITE_PAGES_STABLE, - __BCH_WRITE_PAGES_OWNED, - __BCH_WRITE_ONLY_SPECIFIED_DEVS, - __BCH_WRITE_WROTE_DATA_INLINE, - __BCH_WRITE_FROM_INTERNAL, - __BCH_WRITE_CHECK_ENOSPC, - __BCH_WRITE_SYNC, - __BCH_WRITE_MOVE, - __BCH_WRITE_IN_WORKER, - __BCH_WRITE_DONE, - __BCH_WRITE_IO_ERROR, - __BCH_WRITE_CONVERT_UNWRITTEN, +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x }; -#define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) -#define BCH_WRITE_CACHED (1U << __BCH_WRITE_CACHED) -#define BCH_WRITE_DATA_ENCODED (1U << __BCH_WRITE_DATA_ENCODED) -#define BCH_WRITE_PAGES_STABLE (1U << __BCH_WRITE_PAGES_STABLE) -#define BCH_WRITE_PAGES_OWNED (1U << __BCH_WRITE_PAGES_OWNED) -#define BCH_WRITE_ONLY_SPECIFIED_DEVS (1U << __BCH_WRITE_ONLY_SPECIFIED_DEVS) -#define BCH_WRITE_WROTE_DATA_INLINE (1U << __BCH_WRITE_WROTE_DATA_INLINE) -#define BCH_WRITE_FROM_INTERNAL (1U << __BCH_WRITE_FROM_INTERNAL) -#define BCH_WRITE_CHECK_ENOSPC (1U << __BCH_WRITE_CHECK_ENOSPC) -#define BCH_WRITE_SYNC (1U << __BCH_WRITE_SYNC) -#define BCH_WRITE_MOVE (1U << __BCH_WRITE_MOVE) - -/* Internal: */ -#define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) -#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) -#define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) -#define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN) +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { @@ -124,6 +117,8 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) return wbio; } +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 4e5d3106..3b2ed0fa 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -119,7 +119,7 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:3; unsigned incompressible:1; - unsigned btree_update_ready:1; + unsigned stripe_waited:1; struct bch_devs_list devs_have; u16 target; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index e0c4f51a..5699a9d8 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -68,8 +68,9 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->key_cache_list); + unsigned i; + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); INIT_LIST_HEAD(&p->flushed); atomic_set(&p->count, count); p->devs.nr = 0; @@ -758,19 +759,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, u64 *new_bucket_seq = NULL, *new_buckets = NULL; struct open_bucket **ob = NULL; long *bu = NULL; - unsigned i, nr_got = 0, nr_want = nr - ja->nr; - unsigned old_nr = ja->nr; - unsigned old_discard_idx = ja->discard_idx; - unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; - unsigned old_dirty_idx = ja->dirty_idx; - unsigned old_cur_idx = ja->cur_idx; + unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; int ret = 0; - if (c) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_block(&c->journal); - mutex_lock(&c->sb_lock); - } + BUG_ON(nr <= ja->nr); bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); @@ -778,7 +770,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { ret = -ENOMEM; - goto err_unblock; + goto err_free; } for (nr_got = 0; nr_got < nr_want; nr_got++) { @@ -794,87 +786,92 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) break; + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (ret) { + bch2_open_bucket_put(c, ob[nr_got]); + bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); + break; + } + bu[nr_got] = ob[nr_got]->bucket; } } if (!nr_got) - goto err_unblock; + goto err_free; - /* - * We may be called from the device add path, before the new device has - * actually been added to the running filesystem: - */ - if (!new_fs) - spin_lock(&c->journal.lock); + /* Don't return an error if we successfully allocated some buckets: */ + ret = 0; + + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + mutex_lock(&c->sb_lock); + } memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); + + BUG_ON(ja->discard_idx > ja->nr); + + pos = ja->discard_idx ?: ja->nr; + + memmove(new_buckets + pos + nr_got, + new_buckets + pos, + sizeof(new_buckets[0]) * (ja->nr - pos)); + memmove(new_bucket_seq + pos + nr_got, + new_bucket_seq + pos, + sizeof(new_bucket_seq[0]) * (ja->nr - pos)); for (i = 0; i < nr_got; i++) { - unsigned pos = ja->discard_idx ?: ja->nr; - long b = bu[i]; - - __array_insert_item(ja->buckets, ja->nr, pos); - __array_insert_item(ja->bucket_seq, ja->nr, pos); - ja->nr++; - - ja->buckets[pos] = b; - ja->bucket_seq[pos] = 0; - - if (pos <= ja->discard_idx) - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - if (pos <= ja->dirty_idx_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (pos <= ja->dirty_idx) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - if (pos <= ja->cur_idx) - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + new_buckets[pos + i] = bu[i]; + new_bucket_seq[pos + i] = 0; } - ret = bch2_journal_buckets_to_sb(c, ca); - if (ret) { - /* Revert: */ - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - ja->nr = old_nr; - ja->discard_idx = old_discard_idx; - ja->dirty_idx_ondisk = old_dirty_idx_ondisk; - ja->dirty_idx = old_dirty_idx; - ja->cur_idx = old_cur_idx; - } + nr = ja->nr + nr_got; - if (!new_fs) - spin_unlock(&c->journal.lock); + ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); + if (ret) + goto err_unblock; - if (ja->nr != old_nr && !new_fs) + if (!new_fs) bch2_write_super(c); + /* Commit: */ if (c) - bch2_journal_unblock(&c->journal); + spin_lock(&c->journal.lock); - if (ret) - goto err; + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = nr; + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; - if (!new_fs) { - for (i = 0; i < nr_got; i++) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, - bu[i], BCH_DATA_journal, - ca->mi.bucket_size)); - if (ret) { - bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); - goto err; - } - } - } -err: if (c) + spin_unlock(&c->journal.lock); +err_unblock: + if (c) { + bch2_journal_unblock(&c->journal); mutex_unlock(&c->sb_lock); + } - if (ob && !new_fs) + if (ret && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_free, 0)); +err_free: + if (!new_fs) for (i = 0; i < nr_got; i++) bch2_open_bucket_put(c, ob[i]); @@ -882,12 +879,7 @@ err: kfree(new_buckets); kfree(ob); kfree(bu); - return ret; -err_unblock: - if (c) - bch2_journal_unblock(&c->journal); - goto err; } /* @@ -901,13 +893,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, struct closure cl; int ret = 0; + closure_init_stack(&cl); + + down_write(&c->state_lock); + /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) - return 0; - - closure_init_stack(&cl); + goto unlock; - while (ja->nr != nr) { + while (ja->nr < nr) { struct disk_reservation disk_res = { 0, 0 }; /* @@ -938,7 +932,8 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, if (ret) bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); - +unlock: + up_write(&c->state_lock); return ret; } @@ -977,7 +972,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) seq++) { struct journal_buf *buf = journal_seq_to_buf(j, seq); - if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); @@ -1353,6 +1348,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; + unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1370,15 +1366,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - list_for_each_entry(pin, &pin_list->list, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } - - list_for_each_entry(pin, &pin_list->key_cache_list, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + list_for_each_entry(pin, &pin_list->list[i], list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } if (!list_empty(&pin_list->flushed)) { prt_printf(out, "flushed:"); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 8d3878bd..cfd92d8b 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1339,8 +1339,7 @@ static void __journal_write_alloc(struct journal *j, if (!ca->mi.durability || ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || - bch2_bkey_has_device(bkey_i_to_s_c(&w->key), - ca->dev_idx) || + bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || sectors > ja->sectors_free) continue; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 8744581d..8c88884c 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "btree_update.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -318,9 +319,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) */ while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); - BUG_ON(!fifo_pop(&j->pin, temp)); + fifo_pop(&j->pin, temp); popped = true; } @@ -379,6 +378,17 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } +enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +{ + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) + return JOURNAL_PIN_btree; + else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_key_cache; + else + return JOURNAL_PIN_other; +} + void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) @@ -407,10 +417,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin->seq = seq; pin->flush = flush_fn; - if (flush_fn == bch2_btree_key_cache_journal_flush) - list_add(&pin->list, &pin_list->key_cache_list); - else if (flush_fn) - list_add(&pin->list, &pin_list->list); + if (flush_fn) + list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); else list_add(&pin->list, &pin_list->flushed); @@ -446,37 +454,37 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) static struct journal_entry_pin * journal_get_next_pin(struct journal *j, - bool get_any, - bool get_key_cache, - u64 max_seq, u64 *seq) + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; + unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > max_seq && !get_any && !get_key_cache) + if (*seq > seq_to_flush && !allowed_above_seq) break; - if (*seq <= max_seq || get_any) { - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) - return ret; - } - - if (*seq <= max_seq || get_any || get_key_cache) { - ret = list_first_entry_or_null(&pin_list->key_cache_list, - struct journal_entry_pin, list); - if (ret) - return ret; - } + for (i = 0; i < JOURNAL_PIN_NR; i++) + if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || + ((1U << i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->list[i], + struct journal_entry_pin, list); + if (ret) + return ret; + } } return NULL; } /* returns true if we did work */ -static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, +static size_t journal_flush_pins(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, unsigned min_any, unsigned min_key_cache) { @@ -489,15 +497,25 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, lockdep_assert_held(&j->reclaim_lock); while (1) { + unsigned allowed_above = allowed_above_seq; + unsigned allowed_below = allowed_below_seq; + + if (min_any) { + allowed_above |= ~0; + allowed_below |= ~0; + } + + if (min_key_cache) { + allowed_above |= 1U << JOURNAL_PIN_key_cache; + allowed_below |= 1U << JOURNAL_PIN_key_cache; + } + cond_resched(); j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, - min_any != 0, - min_key_cache != 0, - seq_to_flush, &seq); + pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -656,6 +674,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) atomic_long_read(&c->btree_key_cache.nr_keys)); nr_flushed = journal_flush_pins(j, seq_to_flush, + ~0, 0, min_nr, min_key_cache); if (direct) @@ -776,7 +795,11 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, 0, 0)) + if (journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_key_cache)| + (1U << JOURNAL_PIN_other), 0, 0, 0) || + journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_btree), 0, 0, 0)) *did_work = true; spin_lock(&j->lock); diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index 9b933330..5be78823 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -175,46 +175,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { .to_text = bch2_sb_journal_v2_to_text, }; -int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, + u64 *buckets, unsigned nr) { - struct journal_device *ja = &ca->journal; struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr = 1; + unsigned i, dst = 0, nr_compacted = 1; if (c) lockdep_assert_held(&c->sb_lock); - if (!ja->nr) { + if (!nr) { bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); return 0; } - for (i = 0; i + 1 < ja->nr; i++) - if (ja->buckets[i] + 1 != ja->buckets[i + 1]) - nr++; + for (i = 0; i + 1 < nr; i++) + if (buckets[i] + 1 != buckets[i + 1]) + nr_compacted++; j = bch2_sb_resize_journal_v2(&ca->disk_sb, - (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); + (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) return -BCH_ERR_ENOSPC_sb_journal; bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - j->d[dst].start = le64_to_cpu(ja->buckets[0]); + j->d[dst].start = le64_to_cpu(buckets[0]); j->d[dst].nr = le64_to_cpu(1); - for (i = 1; i < ja->nr; i++) { - if (ja->buckets[i] == ja->buckets[i - 1] + 1) { + for (i = 1; i < nr; i++) { + if (buckets[i] == buckets[i - 1] + 1) { le64_add_cpu(&j->d[dst].nr, 1); } else { dst++; - j->d[dst].start = le64_to_cpu(ja->buckets[i]); + j->d[dst].start = le64_to_cpu(buckets[i]); j->d[dst].nr = le64_to_cpu(1); } } - BUG_ON(dst + 1 != nr); - + BUG_ON(dst + 1 != nr_compacted); return 0; } diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h index a39192e9..ba40a7e8 100644 --- a/libbcachefs/journal_sb.h +++ b/libbcachefs/journal_sb.h @@ -21,4 +21,4 @@ static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_j extern const struct bch_sb_field_ops bch_sb_field_ops_journal; extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; -int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 0e6bde66..8d8c0b3d 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -43,9 +43,15 @@ struct journal_buf { * flushed: */ +enum journal_pin_type { + JOURNAL_PIN_btree, + JOURNAL_PIN_key_cache, + JOURNAL_PIN_other, + JOURNAL_PIN_NR, +}; + struct journal_entry_pin_list { - struct list_head list; - struct list_head key_cache_list; + struct list_head list[JOURNAL_PIN_NR]; struct list_head flushed; atomic_t count; struct bch_devs_list devs; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index e3e39127..d93db07f 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -46,7 +46,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct bkey_i *n; int ret; - if (!bch2_bkey_has_device(k, dev_idx)) + if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; n = bch2_bkey_make_mut(trans, k); @@ -130,8 +130,7 @@ retry: while (bch2_trans_begin(&trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { - if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), - dev_idx)) + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; bch2_bkey_buf_copy(&k, c, &b->key); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 5e952d6c..bb5061bc 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -41,7 +41,8 @@ static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) } struct moving_io { - struct list_head list; + struct list_head read_list; + struct list_head io_list; struct move_bucket_in_flight *b; struct closure cl; bool read_completed; @@ -65,8 +66,12 @@ static void move_free(struct moving_io *io) atomic_dec(&io->b->count); bch2_data_update_exit(&io->write); + + mutex_lock(&ctxt->lock); + list_del(&io->io_list); wake_up(&ctxt->wait); - bch2_write_ref_put(c, BCH_WRITE_REF_move); + mutex_unlock(&ctxt->lock); + kfree(io); } @@ -101,7 +106,7 @@ static void move_write(struct moving_io *io) struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) { struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, list); + list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); return io && io->read_completed ? io : NULL; } @@ -128,7 +133,7 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, bch2_trans_unlock(trans); while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { - list_del(&io->list); + list_del(&io->read_list); move_write(io); } } @@ -145,6 +150,8 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, void bch2_moving_ctxt_exit(struct moving_context *ctxt) { + struct bch_fs *c = ctxt->c; + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); @@ -154,12 +161,15 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->read_ios)); if (ctxt->stats) { - progress_list_del(ctxt->c, ctxt->stats); - - trace_move_data(ctxt->c, + progress_list_del(c, ctxt->stats); + trace_move_data(c, atomic64_read(&ctxt->stats->sectors_moved), atomic64_read(&ctxt->stats->keys_moved)); } + + mutex_lock(&c->moving_context_lock); + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); } void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -172,15 +182,23 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, memset(ctxt, 0, sizeof(*ctxt)); ctxt->c = c; + ctxt->fn = (void *) _RET_IP_; ctxt->rate = rate; ctxt->stats = stats; ctxt->wp = wp; ctxt->wait_on_copygc = wait_on_copygc; closure_init_stack(&ctxt->cl); + + mutex_init(&ctxt->lock); INIT_LIST_HEAD(&ctxt->reads); + INIT_LIST_HEAD(&ctxt->ios); init_waitqueue_head(&ctxt->wait); + mutex_lock(&c->moving_context_lock); + list_add(&ctxt->list, &c->moving_context_list); + mutex_unlock(&c->moving_context_lock); + if (stats) { progress_list_add(c, stats); stats->data_type = BCH_DATA_user; @@ -262,9 +280,6 @@ static int bch2_move_extent(struct btree_trans *trans, return 0; } - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) - return -BCH_ERR_erofs_no_writes; - /* * Before memory allocations & taking nocow locks in * bch2_data_update_init(): @@ -334,9 +349,14 @@ static int bch2_move_extent(struct btree_trans *trans, this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); trace_move_extent_read(k.k); + + mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); atomic_inc(&ctxt->read_ios); - list_add_tail(&io->list, &ctxt->reads); + + list_add_tail(&io->read_list, &ctxt->reads); + list_add_tail(&io->io_list, &ctxt->ios); + mutex_unlock(&ctxt->lock); /* * dropped by move_read_endio() - guards against use after free of @@ -354,7 +374,6 @@ err_free_pages: err_free: kfree(io); err: - bch2_write_ref_put(c, BCH_WRITE_REF_move); trace_and_count(c, move_extent_alloc_mem_fail, k.k); return ret; } @@ -759,8 +778,13 @@ int __bch2_evacuate_bucket(struct btree_trans *trans, data_opts.rewrite_ptrs = 0; bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) + if (ptr->dev == bucket.inode) { data_opts.rewrite_ptrs |= 1U << i; + if (ptr->cached) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + } i++; } @@ -819,14 +843,6 @@ next: } trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { - bch2_trans_unlock(trans); - move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); - closure_sync(&ctxt->cl); - if (!ctxt->write_error) - bch2_verify_bucket_evacuated(trans, bucket, gen); - } err: bch2_bkey_buf_exit(&sk, c); return ret; @@ -1111,3 +1127,67 @@ int bch2_data_job(struct bch_fs *c, return ret; } + +void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); + list_for_each_entry(stats, &c->data_progress_list, list) { + prt_printf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + prt_printf(out, "%s", "\n"); + } + mutex_unlock(&c->data_progress_lock); +} + +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +{ + struct moving_io *io; + + prt_printf(out, "%ps:", ctxt->fn); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "reads: %u sectors %u", + atomic_read(&ctxt->read_ios), + atomic_read(&ctxt->read_sectors)); + prt_newline(out); + + prt_printf(out, "writes: %u sectors %u", + atomic_read(&ctxt->write_ios), + atomic_read(&ctxt->write_sectors)); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) { + bch2_write_op_to_text(out, &io->write.op); + } + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +} + +void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct moving_context *ctxt; + + mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) + bch2_moving_ctxt_to_text(out, ctxt); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_fs_move_init(struct bch_fs *c) +{ + INIT_LIST_HEAD(&c->moving_context_list); + mutex_init(&c->moving_context_lock); + + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); +} diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 4c001387..50a6f7d7 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -11,6 +11,9 @@ struct bch_read_bio; struct moving_context { struct bch_fs *c; + struct list_head list; + void *fn; + struct bch_ratelimit *rate; struct bch_move_stats *stats; struct write_point_specifier wp; @@ -19,7 +22,10 @@ struct moving_context { /* For waiting on outstanding reads and writes: */ struct closure cl; + + struct mutex lock; struct list_head reads; + struct list_head ios; /* in flight sectors: */ atomic_t read_sectors; @@ -84,6 +90,9 @@ int bch2_data_job(struct bch_fs *, struct bch_ioctl_data); void bch2_move_stats_init(struct bch_move_stats *stats, char *name); +void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_move_init(struct bch_fs *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 79aaa45f..178f96a6 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -46,7 +46,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (bch2_bucket_is_open(trans->c, bucket.inode, bucket.offset)) return 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, 0); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); bch2_trans_iter_exit(trans, &iter); @@ -85,7 +85,7 @@ static int move_bucket_cmp(const void *_l, const void *_r) const struct move_bucket *l = _l; const struct move_bucket *r = _r; - return bpos_cmp(l->bucket, r->bucket) ?: cmp_int(l->gen, r->gen); + return bkey_cmp(l->bucket, r->bucket); } static bool bucket_in_flight(move_buckets *buckets_sorted, struct move_bucket b) @@ -178,13 +178,13 @@ static int bch2_copygc(struct btree_trans *trans, move_buckets_in_flight *buckets_in_flight) { struct bch_fs *c = trans->c; - struct bch_move_stats move_stats; struct data_update_opts data_opts = { .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, }; move_buckets buckets = { 0 }; struct move_bucket_in_flight *f; struct move_bucket *i; + u64 moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; ret = bch2_btree_write_buffer_flush(trans); @@ -192,9 +192,6 @@ static int bch2_copygc(struct btree_trans *trans, __func__, bch2_err_str(ret))) return ret; - bch2_move_stats_init(&move_stats, "copygc"); - ctxt->stats = &move_stats; - ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); if (ret) goto err; @@ -222,8 +219,8 @@ err: if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); - trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); - ctxt->stats = NULL; + moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; + trace_and_count(c, copygc, c, moved, 0, 0, 0); return ret; } @@ -282,6 +279,7 @@ static int bch2_copygc_thread(void *arg) struct bch_fs *c = arg; struct btree_trans trans; struct moving_context ctxt; + struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; move_buckets_in_flight move_buckets; u64 last, wait; @@ -294,7 +292,9 @@ static int bch2_copygc_thread(void *arg) set_freezable(); bch2_trans_init(&trans, c, 0, 0); - bch2_moving_ctxt_init(&ctxt, c, NULL, NULL, + + bch2_move_stats_init(&move_stats, "copygc"); + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, writepoint_ptr(&c->copygc_write_point), false); @@ -334,8 +334,8 @@ static int bch2_copygc_thread(void *arg) wake_up(&c->copygc_running_wq); } - bch2_moving_ctxt_exit(&ctxt); bch2_trans_exit(&trans); + bch2_moving_ctxt_exit(&ctxt); free_fifo(&move_buckets); return 0; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index afbf82d6..719693b3 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -92,6 +92,12 @@ enum opt_type { #define RATELIMIT_ERRORS_DEFAULT false #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCACHEFS_VERBOSE_DEFAULT true +#else +#define BCACHEFS_VERBOSE_DEFAULT false +#endif + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FS|OPT_FORMAT| \ @@ -276,7 +282,7 @@ enum opt_type { x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index d2e6adc1..d8426e75 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -189,7 +189,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), - BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + BTREE_ITER_SLOTS, k, ret) { if (reflink_iter.pos.inode) { bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); continue; diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index d7623965..bcc67c0f 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -513,7 +513,9 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.pad = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; @@ -540,7 +542,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, n->v.children[1] = cpu_to_le32(new_snapids[1]); n->v.subvol = 0; SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); if (ret) goto err; } diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 359ca164..3a7f4e29 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -206,11 +206,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) unsigned i, clean_passes = 0; u64 seq = 0; + bch2_fs_ec_stop(c); + bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); + bch2_fs_ec_flush(c); - bch_verbose(c, "flushing journal and stopping allocators"); + bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", + journal_cur_seq(&c->journal)); do { clean_passes++; @@ -224,7 +228,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) } } while (clean_passes < 2); - bch_verbose(c, "flushing journal and stopping allocators complete"); + bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", + journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) @@ -679,6 +684,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); + bch2_fs_move_init(c); INIT_LIST_HEAD(&c->list); @@ -697,17 +703,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); - INIT_LIST_HEAD(&c->ec_stripe_head_list); - mutex_init(&c->ec_stripe_head_lock); - - INIT_LIST_HEAD(&c->ec_stripe_new_list); - mutex_init(&c->ec_stripe_new_lock); - - INIT_LIST_HEAD(&c->data_progress_list); - mutex_init(&c->data_progress_lock); - - mutex_init(&c->ec_stripes_heap_lock); - seqcount_init(&c->gc_pos_lock); seqcount_init(&c->usage_lock); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index ed17b27f..1344ae4c 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -248,6 +248,7 @@ read_attribute(io_timers_read); read_attribute(io_timers_write); read_attribute(data_jobs); +read_attribute(moving_ctxts); #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); @@ -277,25 +278,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) -{ - long ret = 0; - struct bch_move_stats *stats; - - mutex_lock(&c->data_progress_lock); - list_for_each_entry(stats, &c->data_progress_list, list) { - prt_printf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - prt_printf(out, "%s", "\n"); - } - - mutex_unlock(&c->data_progress_lock); - return ret; -} - static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; @@ -476,7 +458,10 @@ SHOW(bch2_fs) bch2_io_timers_to_text(out, &c->io_clock[WRITE]); if (attr == &sysfs_data_jobs) - data_progress_to_text(out, c); + bch2_data_jobs_to_text(out, c); + + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); #ifdef BCH_WRITE_REF_DEBUG if (attr == &sysfs_write_refs) @@ -693,6 +678,7 @@ struct attribute *bch2_fs_internal_files[] = { sysfs_pd_controller_files(rebalance), &sysfs_data_jobs, + &sysfs_moving_ctxts, &sysfs_internal_uuid, NULL diff --git a/linux/six.c b/linux/six.c index 5a6eadc0..3d366a84 100644 --- a/linux/six.c +++ b/linux/six.c @@ -143,8 +143,17 @@ static int __do_six_trylock_type(struct six_lock *lock, * lock, issue a wakeup because we might have caused a * spurious trylock failure: */ +#if 0 + /* + * This code should be sufficient, but we're seeing unexplained + * lost wakeups: + */ if (old.write_locking) ret = -1 - SIX_LOCK_write; +#else + if (!ret) + ret = -1 - SIX_LOCK_write; +#endif } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic64_add(__SIX_VAL(write_locking, 1), @@ -320,11 +329,10 @@ static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, * Similar to the lock path, we may have caused a spurious write * lock fail and need to issue a wakeup: */ - if (old.write_locking) - six_lock_wakeup(lock, old, SIX_LOCK_write); - if (ret) six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); + else + six_lock_wakeup(lock, old, SIX_LOCK_write); return ret; } -- cgit 1.2.3-korg