aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-10-25 02:09:44 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-25 13:59:16 -0400
commit9799b119c34d7be1ee96d143209cfe5fc543d92a (patch)
treef973e4d166a98c57d7ddb32b15eed16a7382278e
parentbd9e0153342c51390ec655b4e78eda1aa1c32a84 (diff)
downloadbcachefs-tools-9799b119c34d7be1ee96d143209cfe5fc543d92a.tar.gz
Update bcachefs sources to 0d63ed13ea3d closures: Fix race in closure_sync()
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--.bcachefs_revision2
-rw-r--r--include/linux/atomic.h6
-rw-r--r--include/linux/closure.h12
-rw-r--r--include/linux/sched.h8
-rw-r--r--libbcachefs/bbpos.h14
-rw-r--r--libbcachefs/bbpos_types.h18
-rw-r--r--libbcachefs/bcachefs.h5
-rw-r--r--libbcachefs/bcachefs_format.h34
-rw-r--r--libbcachefs/bkey_methods.h10
-rw-r--r--libbcachefs/btree_trans_commit.c6
-rw-r--r--libbcachefs/buckets.c165
-rw-r--r--libbcachefs/buckets.h15
-rw-r--r--libbcachefs/chardev.c4
-rw-r--r--libbcachefs/compress.c26
-rw-r--r--libbcachefs/compress.h36
-rw-r--r--libbcachefs/data_update.c21
-rw-r--r--libbcachefs/data_update.h1
-rw-r--r--libbcachefs/disk_groups.c146
-rw-r--r--libbcachefs/disk_groups.h7
-rw-r--r--libbcachefs/disk_groups_types.h18
-rw-r--r--libbcachefs/errcode.h1
-rw-r--r--libbcachefs/extents.c185
-rw-r--r--libbcachefs/extents.h43
-rw-r--r--libbcachefs/fs-io-direct.c1
-rw-r--r--libbcachefs/fsck.c49
-rw-r--r--libbcachefs/fsck.h1
-rw-r--r--libbcachefs/inode.c20
-rw-r--r--libbcachefs/inode.h1
-rw-r--r--libbcachefs/io_misc.c11
-rw-r--r--libbcachefs/io_write.c25
-rw-r--r--libbcachefs/journal.c19
-rw-r--r--libbcachefs/journal.h1
-rw-r--r--libbcachefs/move.c400
-rw-r--r--libbcachefs/move.h62
-rw-r--r--libbcachefs/move_types.h8
-rw-r--r--libbcachefs/movinggc.c37
-rw-r--r--libbcachefs/opts.c3
-rw-r--r--libbcachefs/opts.h1
-rw-r--r--libbcachefs/printbuf.c4
-rw-r--r--libbcachefs/rebalance.c553
-rw-r--r--libbcachefs/rebalance.h9
-rw-r--r--libbcachefs/rebalance_types.h31
-rw-r--r--libbcachefs/recovery.c25
-rw-r--r--libbcachefs/recovery_types.h4
-rw-r--r--libbcachefs/reflink.c53
-rw-r--r--libbcachefs/super.c38
-rw-r--r--libbcachefs/super_types.h12
-rw-r--r--libbcachefs/sysfs.c23
-rw-r--r--libbcachefs/trace.c1
-rw-r--r--libbcachefs/trace.h31
-rw-r--r--libbcachefs/xattr.c2
-rw-r--r--linux/closure.c9
52 files changed, 1419 insertions, 798 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index da47120b..db0b3f7a 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-f70a3402188ea797a38fa9f5b729fb6fbe5f5b83
+0d63ed13ea3d867055ae5752e2e0514a227d1dcb
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index f4d047c1..f1464cf3 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -47,6 +47,7 @@ typedef struct {
#define smp_rmb() cmm_smp_rmb()
#define smp_mb() cmm_smp_mb()
#define smp_read_barrier_depends() cmm_smp_read_barrier_depends()
+#define smp_acquire__after_ctrl_dep() cmm_smp_mb()
#else /* C11_ATOMICS */
@@ -205,6 +206,11 @@ static inline i_type a_type##_dec_return(a_type##_t *v) \
return __ATOMIC_DEC_RETURN(&v->counter); \
} \
\
+static inline i_type a_type##_dec_return_release(a_type##_t *v) \
+{ \
+ return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter); \
+} \
+ \
static inline void a_type##_inc(a_type##_t *v) \
{ \
__ATOMIC_INC(&v->counter); \
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 722a586b..de7bb47d 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -154,6 +154,7 @@ struct closure {
struct closure *parent;
atomic_t remaining;
+ bool closure_get_happened;
#ifdef CONFIG_DEBUG_CLOSURES
#define CLOSURE_MAGIC_DEAD 0xc054dead
@@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
*/
static inline void closure_sync(struct closure *cl)
{
- if (closure_nr_remaining(cl) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+ BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+ if (cl->closure_get_happened)
__closure_sync(cl);
}
@@ -233,8 +238,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
- /* between atomic_dec() in closure_put() */
- smp_mb__before_atomic();
}
static inline void closure_queue(struct closure *cl)
@@ -259,6 +262,8 @@ static inline void closure_queue(struct closure *cl)
*/
static inline void closure_get(struct closure *cl)
{
+ cl->closure_get_happened = true;
+
#ifdef CONFIG_DEBUG_CLOSURES
BUG_ON((atomic_inc_return(&cl->remaining) &
CLOSURE_REMAINING_MASK) <= 1);
@@ -281,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
closure_get(parent);
atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+ cl->closure_get_happened = false;
closure_debug_create(cl);
closure_set_ip(cl);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 825eea5c..7afb6d54 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -151,6 +151,14 @@ static inline u64 ktime_get_seconds(void)
return ts.tv_sec;
}
+static inline u64 ktime_get_real_ns(void)
+{
+ struct timespec ts;
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ return timespec_to_ns(&ts);
+}
+
static inline u64 ktime_get_real_seconds(void)
{
struct timespec ts;
diff --git a/libbcachefs/bbpos.h b/libbcachefs/bbpos.h
index 0038bc28..be2edced 100644
--- a/libbcachefs/bbpos.h
+++ b/libbcachefs/bbpos.h
@@ -2,22 +2,10 @@
#ifndef _BCACHEFS_BBPOS_H
#define _BCACHEFS_BBPOS_H
+#include "bbpos_types.h"
#include "bkey_methods.h"
#include "btree_cache.h"
-struct bbpos {
- enum btree_id btree;
- struct bpos pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
- return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN BBPOS(0, POS_MIN)
-#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
-
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
diff --git a/libbcachefs/bbpos_types.h b/libbcachefs/bbpos_types.h
new file mode 100644
index 00000000..5198e94c
--- /dev/null
+++ b/libbcachefs/bbpos_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+ enum btree_id btree;
+ struct bpos pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+ return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN BBPOS(0, POS_MIN)
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 9863571f..68f0ff03 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -418,6 +418,7 @@ enum bch_time_stats {
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
+#include "disk_groups_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
@@ -463,6 +464,7 @@ enum gc_phase {
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
+ GC_PHASE_BTREE_rebalance_work,
GC_PHASE_PENDING_DELETE,
};
@@ -938,9 +940,6 @@ struct bch_fs {
struct list_head moving_context_list;
struct mutex moving_context_lock;
- struct list_head data_progress_list;
- struct mutex data_progress_lock;
-
/* REBALANCE */
struct bch_fs_rebalance rebalance;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 99749f33..e04999c5 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr {
#endif
};
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:22,
- replicas:4,
- generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 generation:32,
- replicas:4,
- unused:22,
- type:6;
-#endif
-};
-
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:7,
- unused:33,
- compression:8,
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
- unused:33,
- type:7;
+ unused:34,
+ type:6;
#endif
};
@@ -1682,7 +1668,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
+ x(rebalance_work, BCH_VERSION(1, 3), \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -1693,7 +1681,7 @@ enum bcachefs_metadata_version {
};
static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@@ -2306,7 +2294,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert))
+ BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ x(rebalance_work, 18, BTREE_ID_SNAPSHOTS, \
+ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 668f595e..c829c8e3 100644
--- a/libbcachefs/bkey_methods.h
+++ b/libbcachefs/bkey_methods.h
@@ -119,16 +119,6 @@ enum btree_update_flags {
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
- ((1U << KEY_TYPE_alloc)| \
- (1U << KEY_TYPE_alloc_v2)| \
- (1U << KEY_TYPE_alloc_v3)| \
- (1U << KEY_TYPE_alloc_v4)| \
- (1U << KEY_TYPE_stripe)| \
- (1U << KEY_TYPE_inode)| \
- (1U << KEY_TYPE_inode_v2)| \
- (1U << KEY_TYPE_snapshot))
-
static inline int bch2_trans_mark_key(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 1000b456..53ddcaf0 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -382,8 +382,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
return 0;
- if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
ret = bch2_mark_key(trans, i->btree_id, i->level,
old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
@@ -425,8 +424,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
- old_ops->trans_trigger == new_ops->trans_trigger &&
- ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+ old_ops->trans_trigger == new_ops->trans_trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index a1a4b5fe..a8af803e 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -935,14 +935,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
-int bch2_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1018,6 +1016,14 @@ int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
+int bch2_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
int bch2_mark_stripe(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@@ -1124,13 +1130,11 @@ int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
-int bch2_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
@@ -1157,6 +1161,14 @@ int bch2_mark_reservation(struct btree_trans *trans,
return 0;
}
+int bch2_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 start, u64 end,
@@ -1211,13 +1223,11 @@ fsck_err:
return ret;
}
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
+static int __mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
@@ -1251,6 +1261,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
+int bch2_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s_c new,
+ unsigned flags)
+{
+ return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@@ -1452,15 +1470,11 @@ err:
return ret;
}
-int bch2_trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@@ -1517,6 +1531,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
return ret;
}
+int bch2_trans_mark_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_i *new,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+ (int) bch2_bkey_needs_rebalance(c, old);
+
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ if (ret)
+ return ret;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
@@ -1670,15 +1702,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@@ -1700,7 +1727,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
return 0;
}
-static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 *idx, unsigned flags)
{
@@ -1767,35 +1803,38 @@ err:
return ret;
}
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
- struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
- ? old
- : bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
-
- v->front_pad = v->back_pad = 0;
- }
-
idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
end_idx = le64_to_cpu(p.v->idx) + p.k->size +
le32_to_cpu(p.v->back_pad);
while (idx < end_idx && !ret)
- ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
-
+ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
return ret;
}
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_i *new,
+ unsigned flags)
+{
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
@@ -1825,16 +1864,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
- goto out;
+ goto err;
}
- a->v.data_type = type;
- a->v.dirty_sectors = sectors;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto out;
-out:
+ if (a->v.data_type != type ||
+ a->v.dirty_sectors != sectors) {
+ a->v.data_type = type;
+ a->v.dirty_sectors = sectors;
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ }
+err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1929,6 +1968,22 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ int ret = bch2_trans_mark_dev_sb(c, ca);
+ if (ret) {
+ percpu_ref_put(&ca->ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* Disk reservations: */
#define SECTORS_CACHE 1024
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index bf8d7f40..21f6cb35 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -339,12 +339,27 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({ \
+ int ret = 0; \
+ \
+ if (_old.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
+ if (!ret && _new.k->type) \
+ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret; \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
+ mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
{
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index f69e15dc..4bb88aef 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.btree_id,
- .p.pos = ctx->stats.pos,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_usage_read_short(c).used,
};
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 1480b645..a8b148ec 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -697,14 +697,32 @@ err:
return ret;
}
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+ struct bch_compression_opt opt = bch2_compression_decode(v);
+
+ if (opt.type < BCH_COMPRESSION_OPT_NR)
+ prt_str(out, bch2_compression_opts[opt.type]);
+ else
+ prt_printf(out, "(unknown compression opt %u)", opt.type);
+ if (opt.level)
+ prt_printf(out, ":%u", opt.level);
+}
+
void bch2_opt_compression_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
{
- struct bch_compression_opt opt = bch2_compression_decode(v);
+ return bch2_compression_opt_to_text(out, v);
+}
- prt_str(out, bch2_compression_opts[opt.type]);
- if (opt.level)
- prt_printf(out, ":%u", opt.level);
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+ if (!bch2_compression_opt_valid(v)) {
+ prt_printf(err, "invalid compression opt %llu", v);
+ return -BCH_ERR_invalid_sb_opt_compression;
+ }
+
+ return 0;
}
diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h
index 052ea303..607fd5e2 100644
--- a/libbcachefs/compress.h
+++ b/libbcachefs/compress.h
@@ -4,12 +4,18 @@
#include "extents_types.h"
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+ BCH_COMPRESSION_OPTS()
+#undef x
+};
+
struct bch_compression_opt {
u8 type:4,
level:4;
};
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
{
return (struct bch_compression_opt) {
.type = v & 15,
@@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
};
}
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+ struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+ return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+ return bch2_compression_opt_valid(v)
+ ? __bch2_compression_decode(v)
+ : (struct bch_compression_opt) { 0 };
+}
+
static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
{
return opt.type|(opt.level << 4);
}
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
- BCH_COMPRESSION_OPTS()
-#undef x
-};
-
static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
{
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
@@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
#define bch2_opt_compression (struct bch_opt_fn) { \
- .parse = bch2_opt_compression_parse, \
- .to_text = bch2_opt_compression_to_text, \
+ .parse = bch2_opt_compression_parse, \
+ .to_text = bch2_opt_compression_to_text, \
+ .validate = bch2_opt_compression_validate, \
}
#endif /* _BCACHEFS_COMPRESS_H */
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 899ff46d..d116f2f0 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -13,6 +13,7 @@
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
+#include "rebalance.h"
#include "subvolume.h"
#include "trace.h"
@@ -251,11 +252,11 @@ restart_drop_extra_replicas:
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
- k.k->p, insert->k.p);
- if (ret)
- goto err;
-
- ret = bch2_trans_update(trans, &iter, insert,
+ k.k->p, insert->k.p) ?:
+ bch2_bkey_set_needs_rebalance(c, insert,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
@@ -281,11 +282,11 @@ next:
}
continue;
nowork:
- if (m->ctxt && m->ctxt->stats) {
+ if (m->stats && m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
- atomic64_inc(&m->ctxt->stats->keys_raced);
+ atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
- &m->ctxt->stats->sectors_raced);
+ &m->stats->sectors_raced);
}
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@@ -439,6 +440,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
+ m->ctxt = ctxt;
+ m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
@@ -487,7 +490,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (c->opts.nocow_enabled) {
if (ctxt) {
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));
diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h
index 7ca1f98d..9dc17b9d 100644
--- a/libbcachefs/data_update.h
+++ b/libbcachefs/data_update.h
@@ -23,6 +23,7 @@ struct data_update {
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
+ struct bch_move_stats *stats;
struct bch_write_op op;
};
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index e00133b6..d613695a 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -175,6 +175,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
+ memcpy(dst->label, src->label, sizeof(dst->label));
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
@@ -382,7 +383,57 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ struct bch_disk_groups_cpu *groups;
+ struct bch_disk_group_cpu *g;
+ unsigned nr = 0;
+ u16 path[32];
+
+ out->atomic++;
+ rcu_read_lock();
+ groups = rcu_dereference(c->disk_groups);
+ if (!groups)
+ goto invalid;
+
+ while (1) {
+ if (nr == ARRAY_SIZE(path))
+ goto invalid;
+
+ if (v >= groups->nr)
+ goto invalid;
+
+ g = groups->entries + v;
+
+ if (g->deleted)
+ goto invalid;
+
+ path[nr++] = v;
+
+ if (!g->parent)
+ break;
+
+ v = g->parent - 1;
+ }
+
+ while (nr) {
+ v = path[--nr];
+ g = groups->entries + v;
+
+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ if (nr)
+ prt_printf(out, ".");
+ }
+out:
+ rcu_read_unlock();
+ out->atomic--;
+ return;
+invalid:
+ prt_printf(out, "invalid label %u", v);
+ goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_field_get(sb, disk_groups);
@@ -493,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
return -EINVAL;
}
-void bch2_opt_target_to_text(struct printbuf *out,
- struct bch_fs *c,
- struct bch_sb *sb,
- u64 v)
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct target t = target_decode(v);
@@ -504,47 +552,69 @@ void bch2_opt_target_to_text(struct printbuf *out,
case TARGET_NULL:
prt_printf(out, "none");
break;
- case TARGET_DEV:
- if (c) {
- struct bch_dev *ca;
-
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
- ? rcu_dereference(c->devs[t.dev])
- : NULL;
-
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
- prt_printf(out, "offline device %u", t.dev);
- } else {
- prt_printf(out, "invalid device %u", t.dev);
- }
-
- rcu_read_unlock();
+ case TARGET_DEV: {
+ struct bch_dev *ca;
+
+ rcu_read_lock();
+ ca = t.dev < c->sb.nr_devices
+ ? rcu_dereference(c->devs[t.dev])
+ : NULL;
+
+ if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ percpu_ref_put(&ca->io_ref);
+ } else if (ca) {
+ prt_printf(out, "offline device %u", t.dev);
} else {
- struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
- if (bch2_dev_exists(sb, t.dev)) {
- prt_printf(out, "Device ");
- pr_uuid(out, m.uuid.b);
- prt_printf(out, " (%u)", t.dev);
- } else {
- prt_printf(out, "Bad device %u", t.dev);
- }
+ prt_printf(out, "invalid device %u", t.dev);
}
+
+ rcu_read_unlock();
break;
+ }
case TARGET_GROUP:
- if (c) {
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
- mutex_unlock(&c->sb_lock);
+ bch2_disk_path_to_text(out, c, t.group);
+ break;
+ default:
+ BUG();
+ }
+}
+
+void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+ struct target t = target_decode(v);
+
+ switch (t.type) {
+ case TARGET_NULL:
+ prt_printf(out, "none");
+ break;
+ case TARGET_DEV: {
+ struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+ if (bch2_dev_exists(sb, t.dev)) {
+ prt_printf(out, "Device ");
+ pr_uuid(out, m.uuid.b);
+ prt_printf(out, " (%u)", t.dev);
} else {
- bch2_disk_path_to_text(out, sb, t.group);
+ prt_printf(out, "Bad device %u", t.dev);
}
break;
+ }
+ case TARGET_GROUP:
+ bch2_disk_path_to_text_sb(out, sb, t.group);
+ break;
default:
BUG();
}
}
+
+void bch2_opt_target_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_sb *sb,
+ u64 v)
+{
+ if (c)
+ bch2_target_to_text(out, c, v);
+ else
+ bch2_target_to_text_sb(out, sb, v);
+}
diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h
index bd771176..441826ff 100644
--- a/libbcachefs/disk_groups.h
+++ b/libbcachefs/disk_groups.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
+#include "disk_groups_types.h"
+
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
diff --git a/libbcachefs/disk_groups_types.h b/libbcachefs/disk_groups_types.h
new file mode 100644
index 00000000..a54ef085
--- /dev/null
+++ b/libbcachefs/disk_groups_types.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+ bool deleted;
+ u16 parent;
+ u8 label[BCH_SB_LABEL_SIZE];
+ struct bch_devs_mask devs;
+};
+
+struct bch_disk_groups_cpu {
+ struct rcu_head rcu;
+ unsigned nr;
+ struct bch_disk_group_cpu entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 7cc08377..3e9f09ce 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -213,6 +213,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
x(BCH_ERR_invalid_sb, invalid_sb_clean) \
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
+ x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 1b25f84e..0c60d49c 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
+#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
@@ -757,18 +758,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
return i;
}
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
- union bch_extent_entry *next = extent_entry_next(entry);
-
- /* stripes have ptrs, but their layout doesn't work with this code */
- BUG_ON(k.k->type == KEY_TYPE_stripe);
-
- memmove_u64s_down(entry, next,
- (u64 *) bkey_val_end(k) - (u64 *) next);
- k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
/*
* Returns pointer to the next entry after the one being dropped:
*/
@@ -992,10 +981,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
- struct bch_extent_crc_unpacked crc;
- const struct bch_extent_ptr *ptr;
- const struct bch_extent_stripe_ptr *ec;
- struct bch_dev *ca;
bool first = true;
if (c)
@@ -1006,9 +991,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
- case BCH_EXTENT_ENTRY_ptr:
- ptr = entry_to_ptr(entry);
- ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ case BCH_EXTENT_ENTRY_ptr: {
+ const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+ struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
@@ -1030,10 +1015,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " stale");
}
break;
+ }
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
- case BCH_EXTENT_ENTRY_crc128:
- crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+ case BCH_EXTENT_ENTRY_crc128: {
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
@@ -1042,12 +1029,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_csum_types[crc.csum_type],
bch2_compression_types[crc.compression_type]);
break;
- case BCH_EXTENT_ENTRY_stripe_ptr:
- ec = &entry->stripe_ptr;
+ }
+ case BCH_EXTENT_ENTRY_stripe_ptr: {
+ const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
+ }
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ prt_str(out, "rebalance: target ");
+ if (c)
+ bch2_target_to_text(out, c, r->target);
+ else
+ prt_printf(out, "%u", r->target);
+ prt_str(out, " compression ");
+ bch2_compression_opt_to_text(out, r->compression);
+ break;
+ }
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1207,6 +1208,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
return -BCH_ERR_invalid_bkey;
}
crc_since_last_ptr = true;
+
+ if (crc_is_encoded(crc) &&
+ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+ (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT))) {
+ prt_printf(err, "too large encoded extent");
+ return -BCH_ERR_invalid_bkey;
+ }
+
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
if (have_ec) {
@@ -1215,9 +1224,18 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
}
have_ec = true;
break;
- case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_rebalance: {
+ const struct bch_extent_rebalance *r = &entry->rebalance;
+
+ if (!bch2_compression_opt_valid(r->compression)) {
+ struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+ prt_printf(err, "invalid compression opt %u:%u",
+ opt.type, opt.level);
+ return -BCH_ERR_invalid_bkey;
+ }
break;
}
+ }
}
if (!nr_ptrs) {
@@ -1281,6 +1299,125 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+ return &entry->rebalance;
+
+ return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned rewrite_ptrs = 0;
+
+ if (compression) {
+ unsigned compression_type = bch2_compression_opt_to_type(compression);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+ rewrite_ptrs = 0;
+ goto incompressible;
+ }
+
+ if (!p.ptr.cached && p.crc.compression_type != compression_type)
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+incompressible:
+ if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+ const struct bch_extent_ptr *ptr;
+ unsigned i = 0;
+
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+ rewrite_ptrs |= 1U << i;
+ i++;
+ }
+ }
+
+ return rewrite_ptrs;
+}
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ /*
+ * If it's an indirect extent, we don't delete the rebalance entry when
+ * done so that we know what options were applied - check if it still
+ * needs work done:
+ */
+ if (r &&
+ k.k->type == KEY_TYPE_reflink_v &&
+ !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+ r = NULL;
+
+ return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+ unsigned target, unsigned compression)
+{
+ struct bkey_s k = bkey_i_to_s(_k);
+ struct bch_extent_rebalance *r;
+ bool needs_rebalance;
+
+ if (!bkey_extent_is_direct_data(k.k))
+ return 0;
+
+ /* get existing rebalance entry: */
+ r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+ if (r) {
+ if (k.k->type == KEY_TYPE_reflink_v) {
+ /*
+ * indirect extents: existing options take precedence,
+ * so that we don't move extents back and forth if
+ * they're referenced by different inodes with different
+ * options:
+ */
+ if (r->target)
+ target = r->target;
+ if (r->compression)
+ compression = r->compression;
+ }
+
+ r->target = target;
+ r->compression = compression;
+ }
+
+ needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+ if (needs_rebalance && !r) {
+ union bch_extent_entry *new = bkey_val_end(k);
+
+ new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
+ new->rebalance.compression = compression;
+ new->rebalance.target = target;
+ new->rebalance.unused = 0;
+ k.k->u64s += extent_entry_u64s(new);
+ } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+ /*
+ * For indirect extents, don't delete the rebalance entry when
+ * we're finished so that we know we specifically moved it or
+ * compressed it to its current location/compression type
+ */
+ extent_entry_drop(k, (union bch_extent_entry *) r);
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 879e7d21..9110acae 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
memcpy_u64s_small(dst, new, extent_entry_u64s(new));
}
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+ union bch_extent_entry *next = extent_entry_next(entry);
+
+ /* stripes have ptrs, but their layout doesn't work with this code */
+ BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+ memmove_u64s_down(entry, next,
+ (u64 *) bkey_val_end(k) - (u64 *) next);
+ k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
}
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+ return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@@ -693,6 +710,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
void bch2_ptr_swab(struct bkey_s);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+ unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+ unsigned, unsigned);
+
/* Generic extent code: */
enum bch_extent_overlap {
@@ -737,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static inline void extent_save(struct btree *b, struct bkey_packed *dst,
- struct bkey *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- dst_unpacked->k = *src;
- else
- BUG_ON(!bch2_bkey_pack_key(dst, src, f));
-}
-
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index 6a9557e7..5b42a76c 100644
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
} else {
atomic_set(&dio->cl.remaining,
CLOSURE_REMAINING_INITIALIZER + 1);
+ dio->cl.closure_get_happened = true;
}
dio->req = req;
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index f26b824e..328cb3b3 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1299,6 +1299,28 @@ err:
return ret;
}
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
+ const union bch_extent_entry *i;
+ unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+ bkey_for_each_crc(k.k, ptrs, crc, i)
+ if (crc_is_encoded(crc) &&
+ crc.uncompressed_size > encoded_extent_max_sectors) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ return 0;
+}
+
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
@@ -1434,7 +1456,8 @@ int bch2_check_extents(struct bch_fs *c)
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors(trans, &w);
@@ -1448,6 +1471,30 @@ int bch2_check_extents(struct bch_fs *c)
return ret;
}
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct disk_reservation res = { 0 };
+ int ret = 0;
+
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ bch2_trans_put(trans);
+
+ bch_err_fn(c, ret);
+ return ret;
+}
+
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h
index 90c87b50..da991e8c 100644
--- a/libbcachefs/fsck.h
+++ b/libbcachefs/fsck.h
@@ -4,6 +4,7 @@
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index bb3f443d..23fcd442 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -6,6 +6,7 @@
#include "bkey_methods.h"
#include "btree_update.h"
#include "buckets.h"
+#include "compress.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@@ -422,9 +423,10 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
return -BCH_ERR_invalid_bkey;
}
- if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
- prt_printf(err, "invalid data checksum type (%u >= %u)",
- unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
+ if (unpacked.bi_compression &&
+ !bch2_compression_opt_valid(unpacked.bi_compression - 1)) {
+ prt_printf(err, "invalid compression opt %u",
+ unpacked.bi_compression - 1);
return -BCH_ERR_invalid_bkey;
}
@@ -979,6 +981,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+ struct bch_inode_unpacked inode;
+ int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+ if (ret)
+ return ret;
+
+ bch2_inode_opts_get(opts, trans->c, &inode);
+ return 0;
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index a7464e1b..2781e328 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -200,6 +200,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
index 119834cb..0979d5e0 100644
--- a/libbcachefs/io_misc.c
+++ b/libbcachefs/io_misc.c
@@ -16,6 +16,7 @@
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
+#include "rebalance.h"
#include "subvolume.h"
/* Overwrites whatever was present with zeroes: */
@@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+ struct bch_io_opts opts;
u64 dst_offset = le64_to_cpu(op->v.dst_offset);
u64 src_offset = le64_to_cpu(op->v.src_offset);
s64 shift = dst_offset - src_offset;
@@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bool insert = shift > 0;
int ret = 0;
+ ret = bch2_inum_opts_get(trans, inum, &opts);
+ if (ret)
+ return ret;
+
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
BTREE_ITER_INTENT);
@@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index 6e4f85eb..6d9c7772 100644
--- a/libbcachefs/io_write.c
+++ b/libbcachefs/io_write.c
@@ -351,10 +351,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_extent_update(trans, inum, &iter, sk.k,
- &op->res,
- op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k,
+ op->opts.background_target,
+ op->opts.background_compression) ?:
+ bch2_extent_update(trans, inum, &iter, sk.k,
+ &op->res,
+ op->new_i_size, &op->i_sectors_delta,
+ op->flags & BCH_WRITE_CHECK_ENOSPC);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -495,7 +498,6 @@ static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
- struct bkey_i *k;
unsigned dev;
int ret = 0;
@@ -505,14 +507,6 @@ static void __bch2_write_index(struct bch_write_op *op)
goto err;
}
- /*
- * probably not the ideal place to hook this in, but I don't
- * particularly want to plumb io_opts all the way through the btree
- * update stack right now
- */
- for_each_keylist_key(keys, k)
- bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
@@ -816,6 +810,7 @@ static enum prep_encoded_ret {
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
+ op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
op->crc.compressed_size <= wp->sectors_free &&
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
@@ -1091,9 +1086,7 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_decode(e, p, entry) {
- if (p.crc.csum_type ||
- crc_is_compressed(p.crc) ||
- p.has_ec)
+ if (crc_is_encoded(p.crc) || p.has_ec)
return false;
replicas += bch2_extent_ptr_durability(c, &p);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 0e7a9ffa..5b5d69f2 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1019,6 +1019,25 @@ err:
return ret;
}
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned i;
+
+ for_each_online_member(ca, c, i) {
+ if (ca->journal.nr)
+ continue;
+
+ int ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
+ percpu_ref_put(&ca->io_ref);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 491133cc..011711e9 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -534,6 +534,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 82f60c78..1b15b010 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -20,6 +20,7 @@
#include "keylist.h"
#include "move.h"
#include "replicas.h"
+#include "snapshot.h"
#include "super-io.h"
#include "trace.h"
@@ -59,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
}
}
-static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_add(&stats->list, &c->data_progress_list);
- mutex_unlock(&c->data_progress_lock);
-}
-
-static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
-{
- mutex_lock(&c->data_progress_lock);
- list_del(&stats->list);
- mutex_unlock(&c->data_progress_lock);
-}
-
struct moving_io {
struct list_head read_list;
struct list_head io_list;
@@ -156,13 +143,11 @@ static void move_read_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
- if (trans)
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
list_del(&io->read_list);
@@ -170,21 +155,20 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
}
}
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
- struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
- struct bch_fs *c = ctxt->c;
+ struct bch_fs *c = ctxt->trans->c;
- move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
EBUG_ON(atomic_read(&ctxt->write_sectors));
@@ -192,16 +176,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->read_sectors));
EBUG_ON(atomic_read(&ctxt->read_ios));
- if (ctxt->stats) {
- progress_list_del(c, ctxt->stats);
- trace_move_data(c,
- atomic64_read(&ctxt->stats->sectors_moved),
- atomic64_read(&ctxt->stats->keys_moved));
- }
-
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
+
+ bch2_trans_put(ctxt->trans);
+ memset(ctxt, 0, sizeof(*ctxt));
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -213,7 +193,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
{
memset(ctxt, 0, sizeof(*ctxt));
- ctxt->c = c;
+ ctxt->trans = bch2_trans_get(c);
ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
@@ -230,16 +210,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
mutex_lock(&c->moving_context_lock);
list_add(&ctxt->list, &c->moving_context_list);
mutex_unlock(&c->moving_context_lock);
+}
- if (stats) {
- progress_list_add(c, stats);
- stats->data_type = BCH_DATA_user;
- }
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+ trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
+ stats->data_type = BCH_DATA_user;
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
@@ -286,15 +267,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
-static int bch2_move_extent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bch_io_opts io_opts,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct data_update_opts data_opts)
+int bch2_move_extent(struct moving_context *ctxt,
+ struct move_bucket_in_flight *bucket_in_flight,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ struct bch_io_opts io_opts,
+ struct data_update_opts data_opts)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
@@ -303,6 +283,8 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ if (ctxt->stats)
+ ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -355,7 +337,7 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, btree_id, k);
+ io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
@@ -367,9 +349,11 @@ static int bch2_move_extent(struct btree_trans *trans,
BUG_ON(ret);
- io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
+ if (ctxt->rate)
+ bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
if (ctxt->stats) {
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@@ -399,7 +383,7 @@ static int bch2_move_extent(struct btree_trans *trans,
closure_get(&ctxt->cl);
bch2_read_extent(trans, &io->rbio,
bkey_start_pos(k.k),
- btree_id, k, 0,
+ iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@@ -413,45 +397,96 @@ err:
return ret;
}
-static int lookup_inode(struct btree_trans *trans, struct bpos pos,
- struct bch_inode_unpacked *inode)
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+ struct per_snapshot_io_opts *io_opts,
+ struct bkey_s_c extent_k)
+{
+ struct bch_fs *c = trans->c;
+ u32 restart_count = trans->restart_count;
+ int ret = 0;
+
+ if (io_opts->cur_inum != extent_k.k->p.inode) {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ io_opts->d.nr = 0;
+
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ if (k.k->p.offset != extent_k.k->p.inode)
+ break;
+
+ if (!bkey_is_inode(k.k))
+ continue;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+ bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+ ret = darray_push(&io_opts->d, e);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ io_opts->cur_inum = extent_k.k->p.inode;
+ }
+
+ ret = ret ?: trans_was_restarted(trans, restart_count);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (extent_k.k->p.snapshot) {
+ struct snapshot_io_opts_entry *i;
+ darray_for_each(io_opts->d, i)
+ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+ return &i->io_opts;
+ }
+
+ return &io_opts->fs_io_opts;
+}
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+ struct bch_io_opts *io_opts,
+ struct bkey_s_c extent_k)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
- bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
- BTREE_ITER_ALL_SNAPSHOTS);
- k = bch2_btree_iter_peek(&iter);
+ /* reflink btree? */
+ if (!extent_k.k->p.inode) {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+ return 0;
+ }
+
+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+ BTREE_ITER_CACHED);
ret = bkey_err(k);
- if (ret)
- goto err;
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
- if (!k.k || !bkey_eq(k.k->p, pos)) {
- ret = -BCH_ERR_ENOENT_inode;
- goto err;
+ if (!ret && bkey_is_inode(k.k)) {
+ struct bch_inode_unpacked inode;
+ bch2_inode_unpack(k, &inode);
+ bch2_inode_opts_get(io_opts, trans->c, &inode);
+ } else {
+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
}
- ret = bkey_is_inode(k.k) ? 0 : -EIO;
- if (ret)
- goto err;
-
- ret = bch2_inode_unpack(k, inode);
- if (ret)
- goto err;
-err:
bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-static int move_ratelimit(struct btree_trans *trans,
- struct moving_context *ctxt)
+int bch2_move_ratelimit(struct moving_context *ctxt)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = ctxt->trans->c;
u64 delay;
if (ctxt->wait_on_copygc) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@@ -461,7 +496,7 @@ static int move_ratelimit(struct btree_trans *trans,
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
if (delay) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -474,7 +509,7 @@ static int move_ratelimit(struct btree_trans *trans,
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
@@ -483,7 +518,7 @@ static int move_ratelimit(struct btree_trans *trans,
* XXX: these limits really ought to be per device, SSDs and hard drives
* will want different limits
*/
- move_ctxt_wait_event(ctxt, trans,
+ move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@@ -492,52 +527,28 @@ static int move_ratelimit(struct btree_trans *trans,
return 0;
}
-static int move_get_io_opts(struct btree_trans *trans,
- struct bch_io_opts *io_opts,
- struct bkey_s_c k, u64 *cur_inum)
-{
- struct bch_inode_unpacked inode;
- int ret;
-
- if (*cur_inum == k.k->p.inode)
- return 0;
-
- ret = lookup_inode(trans,
- SPOS(0, k.k->p.inode, k.k->p.snapshot),
- &inode);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
-
- if (!ret)
- bch2_inode_opts_get(io_opts, trans->c, &inode);
- else
- *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
- *cur_inum = k.k->p.inode;
- return 0;
-}
-
-static int __bch2_move_data(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
+static int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id)
{
- struct bch_fs *c = ctxt->c;
- struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct per_snapshot_io_opts snapshot_io_opts;
+ struct bch_io_opts *io_opts;
struct bkey_buf sk;
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct data_update_opts data_opts;
- u64 cur_inum = U64_MAX;
int ret = 0, ret2;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
bch2_bkey_buf_init(&sk);
if (ctxt->stats) {
ctxt->stats->data_type = BCH_DATA_user;
- ctxt->stats->btree_id = btree_id;
- ctxt->stats->pos = start;
+ ctxt->stats->pos = BBPOS(btree_id, start);
}
bch2_trans_iter_init(trans, &iter, btree_id, start,
@@ -547,7 +558,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
- while (!move_ratelimit(trans, ctxt)) {
+ while (!bch2_move_ratelimit(ctxt)) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@@ -564,17 +575,18 @@ static int __bch2_move_data(struct moving_context *ctxt,
break;
if (ctxt->stats)
- ctxt->stats->pos = iter.pos;
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
- ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, k, &io_opts, &data_opts))
+ if (!pred(c, arg, k, io_opts, &data_opts))
goto next;
/*
@@ -584,24 +596,20 @@ static int __bch2_move_data(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
- io_opts, btree_id, k, data_opts);
+ ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
-
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@@ -610,59 +618,68 @@ next_nondata:
}
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
+int __bch2_move_data(struct moving_context *ctxt,
+ struct bbpos start,
+ struct bbpos end,
+ move_pred_fn pred, void *arg)
{
- struct moving_context ctxt;
+ struct bch_fs *c = ctxt->trans->c;
enum btree_id id;
int ret = 0;
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+ for (id = start.btree;
+ id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ ctxt->stats->pos = BBPOS(id, POS_MIN);
- if (id != BTREE_ID_extents &&
- id != BTREE_ID_reflink)
+ if (!btree_type_has_ptrs(id) ||
+ !bch2_btree_id_root(c, id)->b)
continue;
- if (!bch2_btree_id_root(c, id)->b)
- continue;
-
- ret = __bch2_move_data(&ctxt,
- id == start_btree_id ? start_pos : POS_MIN,
- id == end_btree_id ? end_pos : POS_MAX,
+ ret = bch2_move_data_btree(ctxt,
+ id == start.btree ? start.pos : POS_MIN,
+ id == end.btree ? end.pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bbpos start,
+ struct bbpos end,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+
+ struct moving_context ctxt;
+ int ret;
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-int __bch2_evacuate_bucket(struct btree_trans *trans,
- struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
{
- struct bch_fs *c = ctxt->c;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
struct bkey_buf sk;
@@ -673,7 +690,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
struct data_update_opts data_opts;
unsigned dirty_sectors, bucket_size;
u64 fragmentation;
- u64 cur_inum = U64_MAX;
struct bpos bp_pos = POS_MIN;
int ret = 0;
@@ -708,7 +724,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
goto err;
}
- while (!(ret = move_ratelimit(trans, ctxt))) {
+ while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
@@ -737,7 +753,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
@@ -758,23 +774,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
i++;
}
- ret = bch2_move_extent(trans, &iter, ctxt,
- bucket_in_flight,
- io_opts, bp.btree_id, k, data_opts);
+ ret = bch2_move_extent(ctxt, bucket_in_flight,
+ &iter, k, io_opts, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt, trans);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
if (ret)
goto err;
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
} else {
@@ -825,14 +838,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
struct write_point_specifier wp,
bool wait_on_copygc)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
+ ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
bch2_moving_ctxt_exit(&ctxt);
- bch2_trans_put(trans);
return ret;
}
@@ -849,21 +860,25 @@ static int bch2_move_btree(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
- struct btree_trans *trans = bch2_trans_get(c);
+ struct moving_context ctxt;
+ struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_update_opts data_opts;
int ret = 0;
- progress_list_add(c, stats);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+ writepoint_ptr(&c->btree_write_point),
+ true);
+ trans = ctxt.trans;
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
- stats->btree_id = id;
+ stats->pos = BBPOS(id, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
continue;
@@ -882,7 +897,7 @@ retry:
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
- stats->pos = iter.pos;
+ stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
@@ -904,14 +919,10 @@ next:
break;
}
- bch2_trans_put(trans);
-
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
+ bch2_moving_ctxt_exit(&ctxt);
bch2_btree_interior_updates_flush(c);
- progress_list_del(c, stats);
return ret;
}
@@ -1032,8 +1043,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
mutex_unlock(&c->sb_lock);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1056,14 +1066,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
@@ -1080,18 +1092,21 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ (struct bbpos) { op.start_btree, op.start_pos },
+ (struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
+
+ bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
+ bch2_move_stats_exit(stats, c);
break;
default:
ret = -EINVAL;
@@ -1100,19 +1115,43 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- struct bch_move_stats *stats = ctxt->stats;
- struct moving_io *io;
+ prt_printf(out, "%s: data type=%s pos=",
+ stats->name,
+ bch2_data_types[stats->data_type]);
+ bch2_bbpos_to_text(out, stats->pos);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_str(out, "keys moved: ");
+ prt_u64(out, atomic64_read(&stats->keys_moved));
+ prt_newline(out);
+
+ prt_str(out, "keys raced: ");
+ prt_u64(out, atomic64_read(&stats->keys_raced));
+ prt_newline(out);
+
+ prt_str(out, "bytes seen: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+ prt_newline(out);
- prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
+ prt_str(out, "bytes moved: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_printf(out, " data type %s btree_id %s position: ",
- bch2_data_types[stats->data_type],
- bch2_btree_id_str(stats->btree_id));
- bch2_bpos_to_text(out, stats->pos);
+ prt_str(out, "bytes raced: ");
+ prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+ struct moving_io *io;
+
+ bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
prt_printf(out, "reads: ios %u/%u sectors %u/%u",
@@ -1153,7 +1192,4 @@ void bch2_fs_move_init(struct bch_fs *c)
{
INIT_LIST_HEAD(&c->moving_context_list);
mutex_init(&c->moving_context_lock);
-
- INIT_LIST_HEAD(&c->data_progress_list);
- mutex_init(&c->data_progress_lock);
}
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index cbdd58db..1b1e8678 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
+#include "bbpos.h"
#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
@@ -11,7 +12,7 @@
struct bch_read_bio;
struct moving_context {
- struct bch_fs *c;
+ struct btree_trans *trans;
struct list_head list;
void *fn;
@@ -37,10 +38,10 @@ struct moving_context {
wait_queue_head_t wait;
};
-#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
+#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
bool cond_finished = false; \
- bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
+ bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
@@ -59,22 +60,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
- struct btree_trans *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+ u32 snapshot;
+ struct bch_io_opts io_opts;
+};
+
+struct per_snapshot_io_opts {
+ u64 cur_inum;
+ struct bch_io_opts fs_io_opts;
+ DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+ memset(io_opts, 0, sizeof(*io_opts));
+ io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+ darray_exit(&io_opts->d);
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+int bch2_move_extent(struct moving_context *,
+ struct move_bucket_in_flight *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+ struct bbpos,
+ struct bbpos,
+ move_pred_fn, void *);
int bch2_move_data(struct bch_fs *,
- enum btree_id, struct bpos,
- enum btree_id, struct bpos,
+ struct bbpos start,
+ struct bbpos end,
struct bch_ratelimit *,
struct bch_move_stats *,
struct write_point_specifier,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct btree_trans *,
- struct moving_context *,
+int __bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
@@ -88,7 +127,10 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_move_init(struct bch_fs *);
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
index baf1f857..e22841ef 100644
--- a/libbcachefs/move_types.h
+++ b/libbcachefs/move_types.h
@@ -2,17 +2,17 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
+#include "bbpos_types.h"
+
struct bch_move_stats {
enum bch_data_type data_type;
- enum btree_id btree_id;
- struct bpos pos;
- struct list_head list;
+ struct bbpos pos;
char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
- atomic64_t sectors_moved;
atomic64_t sectors_seen;
+ atomic64_t sectors_moved;
atomic64_t sectors_raced;
};
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 4017120b..f73b9b7f 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret;
}
-static void move_buckets_wait(struct btree_trans *trans,
- struct moving_context *ctxt,
+static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
@@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
while ((i = list->first)) {
if (flush)
- move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+ move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
if (atomic_read(&i->count))
break;
@@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
kfree(i);
}
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt->trans);
}
static bool bucket_in_flight(struct buckets_in_flight *list,
@@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
typedef DARRAY(struct move_bucket) move_buckets;
-static int bch2_copygc_get_buckets(struct btree_trans *trans,
- struct moving_context *ctxt,
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
@@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
- move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+ move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@@ -188,10 +187,10 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
}
noinline
-static int bch2_copygc(struct btree_trans *trans,
- struct moving_context *ctxt,
+static int bch2_copygc(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight)
{
+ struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
@@ -202,7 +201,7 @@ static int bch2_copygc(struct btree_trans *trans,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
@@ -221,7 +220,7 @@ static int bch2_copygc(struct btree_trans *trans,
break;
}
- ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+ ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
@@ -300,7 +299,6 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
- struct btree_trans *trans;
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
@@ -317,7 +315,6 @@ static int bch2_copygc_thread(void *arg)
}
set_freezable();
- trans = bch2_trans_get(c);
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -325,16 +322,16 @@ static int bch2_copygc_thread(void *arg)
false);
while (!ret && !kthread_should_stop()) {
- bch2_trans_unlock(trans);
+ bch2_trans_unlock(ctxt.trans);
cond_resched();
if (!c->copy_gc_enabled) {
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
kthread_wait_freezable(c->copy_gc_enabled);
}
if (unlikely(freezing(current))) {
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
__refrigerator(false);
continue;
}
@@ -345,7 +342,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@@ -355,16 +352,16 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
- ret = bch2_copygc(trans, &ctxt, &buckets);
+ ret = bch2_copygc(&ctxt, &buckets);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
}
- move_buckets_wait(trans, &ctxt, &buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
rhashtable_destroy(&buckets.table);
- bch2_trans_put(trans);
bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
return 0;
}
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 8294f56e..b7722b62 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -294,6 +294,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
return -EINVAL;
}
+ if (opt->fn.validate)
+ return opt->fn.validate(v, err);
+
return 0;
}
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 16dd0f06..2307cdd2 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -74,6 +74,7 @@ enum opt_type {
struct bch_opt_fn {
int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+ int (*validate)(u64, struct printbuf *);
};
/**
diff --git a/libbcachefs/printbuf.c b/libbcachefs/printbuf.c
index de41f9a1..5e653eb8 100644
--- a/libbcachefs/printbuf.c
+++ b/libbcachefs/printbuf.c
@@ -415,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out,
while (list[nr])
nr++;
- while (flags && (bit = __ffs(flags)) < nr) {
+ while (flags && (bit = __ffs64(flags)) < nr) {
if (!first)
bch2_prt_printf(out, ",");
first = false;
bch2_prt_printf(out, "%s", list[bit]);
- flags ^= 1 << bit;
+ flags ^= BIT_ULL(bit);
}
}
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 568f1e8e..6ba8574b 100644
--- a/libbcachefs/rebalance.c
+++ b/libbcachefs/rebalance.c
@@ -3,13 +3,18 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
+#include "error.h"
+#include "inode.h"
#include "move.h"
#include "rebalance.h"
+#include "subvolume.h"
#include "super-io.h"
#include "trace.h"
@@ -17,302 +22,398 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
-/*
- * Check if an extent should be moved:
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
- */
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned i;
+#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
- data_opts->rewrite_ptrs = 0;
- data_opts->target = io_opts->background_target;
- data_opts->extra_replicas = 0;
- data_opts->btree_insert_flags = 0;
-
- if (io_opts->background_compression &&
- !bch2_bkey_is_incompressible(k)) {
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- i = 0;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (!p.ptr.cached &&
- p.crc.compression_type !=
- bch2_compression_opt_to_type(io_opts->background_compression))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
-
- if (io_opts->background_target) {
- const struct bch_extent_ptr *ptr;
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+ BCH_REBALANCE_STATES()
+ NULL
+#undef x
+};
- i = 0;
- bkey_for_each_ptr(ptrs, ptr) {
- if (!ptr->cached &&
- !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
- bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
- data_opts->rewrite_ptrs |= 1U << i;
- i++;
- }
- }
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ struct bkey_i_cookie *cookie;
+ u64 v;
+ int ret;
- return data_opts->rewrite_ptrs != 0;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+ ret = PTR_ERR_OR_ZERO(cookie);
+ if (ret)
+ goto err;
+
+ bkey_cookie_init(&cookie->k_i);
+ cookie->k.p = iter.pos;
+ cookie->v.cookie = cpu_to_le64(v + 1);
+
+ ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-void bch2_rebalance_add_key(struct bch_fs *c,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts)
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- struct data_update_opts update_opts = { 0 };
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
- unsigned i;
+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ __bch2_set_rebalance_needs_scan(trans, inum));
+ rebalance_wakeup(c);
+ return ret;
+}
- if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
- return;
-
- i = 0;
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr(ptrs, ptr) {
- if ((1U << i) && update_opts.rewrite_ptrs)
- if (atomic64_add_return(k.k->size,
- &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
- k.k->size)
- rebalance_wakeup(c);
- i++;
- }
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+ return bch2_set_rebalance_needs_scan(c, 0);
}
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
- rebalance_wakeup(c);
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ u64 v;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+ BTREE_ITER_INTENT);
+ k = bch2_btree_iter_peek_slot(&iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ v = k.k->type == KEY_TYPE_cookie
+ ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+ : 0;
+
+ if (v == cookie)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
-struct rebalance_work {
- int dev_most_full_idx;
- unsigned dev_most_full_percent;
- u64 dev_most_full_work;
- u64 dev_most_full_capacity;
- u64 total_work;
-};
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+ struct btree_iter *work_iter)
+{
+ return !kthread_should_stop()
+ ? bch2_btree_iter_peek(work_iter)
+ : bkey_s_c_null;
+}
-static void rebalance_work_accumulate(struct rebalance_work *w,
- u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
- unsigned percent_full;
- u64 work = dev_work + unknown_dev;
+ struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ return ret;
- /* avoid divide by 0 */
- if (!capacity)
- return;
+ extent_entry_drop(bkey_i_to_s(n),
+ (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+ return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter,
+ struct data_update_opts *data_opts)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+ work_pos,
+ BTREE_ITER_ALL_SNAPSHOTS);
+ k = bch2_btree_iter_peek_slot(extent_iter);
+ if (bkey_err(k))
+ return k;
+
+ const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+ if (!r) {
+ /* raced due to btree write buffer, nothing to do */
+ return bkey_s_c_null;
+ }
- if (work < dev_work || work < unknown_dev)
- work = U64_MAX;
- work = min(work, capacity);
+ memset(data_opts, 0, sizeof(*data_opts));
- percent_full = div64_u64(work * 100, capacity);
+ data_opts->rewrite_ptrs =
+ bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+ data_opts->target = r->target;
- if (percent_full >= w->dev_most_full_percent) {
- w->dev_most_full_idx = idx;
- w->dev_most_full_percent = percent_full;
- w->dev_most_full_work = work;
- w->dev_most_full_capacity = capacity;
+ if (!data_opts->rewrite_ptrs) {
+ /*
+ * device we would want to write to offline? devices in target
+ * changed?
+ *
+ * We'll now need a full scan before this extent is picked up
+ * again:
+ */
+ int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+ if (ret)
+ return bkey_s_c_err(ret);
+ return bkey_s_c_null;
}
- if (w->total_work + dev_work >= w->total_work &&
- w->total_work + dev_work >= dev_work)
- w->total_work += dev_work;
+ return k;
}
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+ struct bpos work_pos,
+ struct btree_iter *extent_iter)
{
- struct bch_dev *ca;
- struct rebalance_work ret = { .dev_most_full_idx = -1 };
- u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
- unsigned i;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ struct data_update_opts data_opts;
+ struct bch_io_opts io_opts;
+ struct bkey_s_c k;
+ struct bkey_buf sk;
+ int ret;
+
+ ctxt->stats = &r->work_stats;
+ r->state = BCH_REBALANCE_working;
- for_each_online_member(ca, c, i)
- rebalance_work_accumulate(&ret,
- atomic64_read(&ca->rebalance_work),
- unknown_dev,
- bucket_to_sector(ca, ca->mi.nbuckets -
- ca->mi.first_bucket),
- i);
+ bch2_bkey_buf_init(&sk);
- rebalance_work_accumulate(&ret,
- unknown_dev, 0, c->capacity, -1);
+ ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+ extent_iter, &data_opts));
+ if (ret || !k.k)
+ goto out;
+ ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+ if (ret)
+ goto out;
+
+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+ /*
+ * The iterator gets unlocked by __bch2_read_extent - need to
+ * save a copy of @k elsewhere:
+ */
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
+
+ ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+ if (ret) {
+ if (bch2_err_matches(ret, ENOMEM)) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ ret = -BCH_ERR_transaction_restart_nested;
+ }
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ goto out;
+
+ /* skip it and continue, XXX signal failure */
+ ret = 0;
+ }
+out:
+ bch2_bkey_buf_exit(&sk, c);
return ret;
}
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- struct bch_dev *ca;
- unsigned i;
+ unsigned target, compression;
- for_each_online_member(ca, c, i)
- atomic64_set(&ca->rebalance_work, 0);
+ if (k.k->p.inode) {
+ target = io_opts->background_target;
+ compression = io_opts->background_compression ?: io_opts->compression;
+ } else {
+ const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+ target = r ? r->target : io_opts->background_target;
+ compression = r ? r->compression :
+ (io_opts->background_compression ?: io_opts->compression);
+ }
- atomic64_set(&c->rebalance.work_unknown_dev, 0);
+ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+ data_opts->target = target;
+ return data_opts->rewrite_ptrs != 0;
}
-static unsigned long curr_cputime(void)
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
- u64 utime, stime;
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs_rebalance *r = &trans->c->rebalance;
+ int ret;
+
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+ ctxt->stats = &r->scan_stats;
- task_cputime_adjusted(current, &utime, &stime);
- return nsecs_to_jiffies(utime + stime);
+ if (!inum) {
+ r->scan_start = BBPOS_MIN;
+ r->scan_end = BBPOS_MAX;
+ } else {
+ r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
+ r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+ }
+
+ r->state = BCH_REBALANCE_scanning;
+
+ ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ bch2_move_stats_exit(&r->scan_stats, trans->c);
+ return ret;
}
-static int bch2_rebalance_thread(void *arg)
+static void rebalance_wait(struct bch_fs *c)
{
- struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
+ struct bch_dev *ca;
struct io_clock *clock = &c->io_clock[WRITE];
- struct rebalance_work w, p;
- struct bch_move_stats move_stats;
- unsigned long start, prev_start;
- unsigned long prev_run_time, prev_run_cputime;
- unsigned long cputime, prev_cputime;
- u64 io_start;
- long throttle;
+ u64 now = atomic64_read(&clock->now);
+ u64 min_member_capacity = 128 * 2048;
+ unsigned i;
- set_freezable();
+ for_each_rw_member(ca, c, i)
+ min_member_capacity = min(min_member_capacity,
+ ca->mi.nbuckets * ca->mi.bucket_size);
+
+ r->wait_iotime_end = now + (min_member_capacity >> 6);
+
+ if (r->state != BCH_REBALANCE_waiting) {
+ r->wait_iotime_start = now;
+ r->wait_wallclock_start = ktime_get_real_ns();
+ r->state = BCH_REBALANCE_waiting;
+ }
- io_start = atomic64_read(&clock->now);
- p = rebalance_work(c);
- prev_start = jiffies;
- prev_cputime = curr_cputime();
+ bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
- bch2_move_stats_init(&move_stats, "rebalance");
- while (!kthread_wait_freezable(r->enabled)) {
- cond_resched();
+static int do_rebalance(struct moving_context *ctxt)
+{
+ struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
+ struct bch_fs_rebalance *r = &c->rebalance;
+ struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct bkey_s_c k;
+ int ret = 0;
- start = jiffies;
- cputime = curr_cputime();
+ bch2_move_stats_init(&r->work_stats, "rebalance_work");
+ bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
- prev_run_time = start - prev_start;
- prev_run_cputime = cputime - prev_cputime;
+ bch2_trans_iter_init(trans, &rebalance_work_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS);
- w = rebalance_work(c);
- BUG_ON(!w.dev_most_full_capacity);
+ while (!bch2_move_ratelimit(ctxt) &&
+ !kthread_wait_freezable(r->enabled)) {
+ bch2_trans_begin(trans);
- if (!w.total_work) {
- r->state = REBALANCE_WAITING;
- kthread_wait_freezable(rebalance_work(c).total_work);
+ ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
- }
+ if (ret || !k.k)
+ break;
- /*
- * If there isn't much work to do, throttle cpu usage:
- */
- throttle = prev_run_cputime * 100 /
- max(1U, w.dev_most_full_percent) -
- prev_run_time;
-
- if (w.dev_most_full_percent < 20 && throttle > 0) {
- r->throttled_until_iotime = io_start +
- div_u64(w.dev_most_full_capacity *
- (20 - w.dev_most_full_percent),
- 50);
-
- if (atomic64_read(&clock->now) + clock->max_slop <
- r->throttled_until_iotime) {
- r->throttled_until_cputime = start + throttle;
- r->state = REBALANCE_THROTTLED;
-
- bch2_kthread_io_clock_wait(clock,
- r->throttled_until_iotime,
- throttle);
- continue;
- }
- }
+ ret = k.k->type == KEY_TYPE_cookie
+ ? do_rebalance_scan(ctxt, k.k->p.inode,
+ le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+ : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ break;
- /* minimum 1 mb/sec: */
- r->pd.rate.rate =
- max_t(u64, 1 << 11,
- r->pd.rate.rate *
- max(p.dev_most_full_percent, 1U) /
- max(w.dev_most_full_percent, 1U));
-
- io_start = atomic64_read(&clock->now);
- p = w;
- prev_start = start;
- prev_cputime = cputime;
-
- r->state = REBALANCE_RUNNING;
- memset(&move_stats, 0, sizeof(move_stats));
- rebalance_work_reset(c);
-
- bch2_move_data(c,
- 0, POS_MIN,
- BTREE_ID_NR, POS_MAX,
- /* ratelimiting disabled for now */
- NULL, /* &r->pd.rate, */
- &move_stats,
- writepoint_ptr(&c->rebalance_write_point),
- true,
- rebalance_pred, NULL);
+ bch2_btree_iter_advance(&rebalance_work_iter);
}
- return 0;
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_work_iter);
+ bch2_move_stats_exit(&r->scan_stats, c);
+
+ if (!ret &&
+ !kthread_should_stop() &&
+ !atomic64_read(&r->work_stats.sectors_seen) &&
+ !atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_trans_unlock(trans);
+ rebalance_wait(c);
+ }
+
+ bch_err_fn(c, ret);
+ return ret;
}
-void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+static int bch2_rebalance_thread(void *arg)
{
+ struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
- struct rebalance_work w = rebalance_work(c);
+ struct moving_context ctxt;
+ int ret;
- if (!out->nr_tabstops)
- printbuf_tabstop_push(out, 20);
+ set_freezable();
- prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
- prt_tab(out);
+ bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+ writepoint_ptr(&c->rebalance_write_point),
+ true);
- prt_human_readable_u64(out, w.dev_most_full_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
- prt_newline(out);
+ while (!kthread_should_stop() &&
+ !(ret = do_rebalance(&ctxt)))
+ ;
- prt_printf(out, "total work:");
- prt_tab(out);
+ bch2_moving_ctxt_exit(&ctxt);
- prt_human_readable_u64(out, w.total_work << 9);
- prt_printf(out, "/");
- prt_human_readable_u64(out, c->capacity << 9);
- prt_newline(out);
+ return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
- prt_printf(out, "rate:");
- prt_tab(out);
- prt_printf(out, "%u", r->pd.rate.rate);
+ prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
+ printbuf_indent_add(out, 2);
switch (r->state) {
- case REBALANCE_WAITING:
- prt_printf(out, "waiting");
+ case BCH_REBALANCE_waiting: {
+ u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+ prt_str(out, "io wait duration: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+ prt_newline(out);
+
+ prt_str(out, "io wait remaining: ");
+ bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+ prt_newline(out);
+
+ prt_str(out, "duration waited: ");
+ bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+ prt_newline(out);
break;
- case REBALANCE_THROTTLED:
- prt_printf(out, "throttled for %lu sec or ",
- (r->throttled_until_cputime - jiffies) / HZ);
- prt_human_readable_u64(out,
- (r->throttled_until_iotime -
- atomic64_read(&c->io_clock[WRITE].now)) << 9);
- prt_printf(out, " io");
+ }
+ case BCH_REBALANCE_working:
+ bch2_move_stats_to_text(out, &r->work_stats);
break;
- case REBALANCE_RUNNING:
- prt_printf(out, "running");
+ case BCH_REBALANCE_scanning:
+ bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
prt_newline(out);
+ printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
@@ -361,6 +462,4 @@ int bch2_rebalance_start(struct bch_fs *c)
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
-
- atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}
diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h
index 7ade0bb8..28a52638 100644
--- a/libbcachefs/rebalance.h
+++ b/libbcachefs/rebalance.h
@@ -4,6 +4,9 @@
#include "rebalance_types.h"
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
@@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
rcu_read_unlock();
}
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
- struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h
index 7462a92e..0fffb536 100644
--- a/libbcachefs/rebalance_types.h
+++ b/libbcachefs/rebalance_types.h
@@ -2,25 +2,36 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
+#include "bbpos_types.h"
#include "move_types.h"
-enum rebalance_state {
- REBALANCE_WAITING,
- REBALANCE_THROTTLED,
- REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES() \
+ x(waiting) \
+ x(working) \
+ x(scanning)
+
+enum bch_rebalance_states {
+#define x(t) BCH_REBALANCE_##t,
+ BCH_REBALANCE_STATES()
+#undef x
};
struct bch_fs_rebalance {
- struct task_struct __rcu *thread;
+ struct task_struct __rcu *thread;
struct bch_pd_controller pd;
- atomic64_t work_unknown_dev;
+ enum bch_rebalance_states state;
+ u64 wait_iotime_start;
+ u64 wait_iotime_end;
+ u64 wait_wallclock_start;
+
+ struct bch_move_stats work_stats;
- enum rebalance_state state;
- u64 throttled_until_iotime;
- unsigned long throttled_until_cputime;
+ struct bbpos scan_start;
+ struct bbpos scan_end;
+ struct bch_move_stats scan_stats;
- unsigned enabled:1;
+ unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 55663253..02025099 100644
--- a/libbcachefs/recovery.c
+++ b/libbcachefs/recovery.c
@@ -23,6 +23,7 @@
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
+#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
@@ -946,16 +947,12 @@ int bch2_fs_initialize(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- for_each_online_member(ca, c, i)
+ for_each_member_device(ca, c, i)
bch2_dev_usage_init(ca);
- for_each_online_member(ca, c, i) {
- ret = bch2_dev_journal_alloc(ca);
- if (ret) {
- percpu_ref_put(&ca->io_ref);
- goto err;
- }
- }
+ ret = bch2_fs_journal_alloc(c);
+ if (ret)
+ goto err;
/*
* journal_res_get() will crash if called before this has
@@ -973,15 +970,13 @@ int bch2_fs_initialize(struct bch_fs *c)
* btree updates
*/
bch_verbose(c, "marking superblocks");
- for_each_member_device(ca, c, i) {
- ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- percpu_ref_put(&ca->ref);
- goto err;
- }
+ ret = bch2_trans_mark_dev_sbs(c);
+ bch_err_msg(c, ret, "marking superblocks");
+ if (ret)
+ goto err;
+ for_each_online_member(ca, c, i)
ca->new_fs_bucket_idx = 0;
- }
ret = bch2_fs_freespace_init(c);
if (ret)
diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h
index 4c1cea2a..515e3d62 100644
--- a/libbcachefs/recovery_types.h
+++ b/libbcachefs/recovery_types.h
@@ -14,6 +14,8 @@
x(snapshots_read, PASS_ALWAYS) \
x(check_topology, 0) \
x(check_allocations, PASS_FSCK) \
+ x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \
+ x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, PASS_ALWAYS) \
x(check_alloc_info, PASS_FSCK) \
@@ -32,6 +34,7 @@
x(resume_logged_ops, PASS_ALWAYS) \
x(check_inodes, PASS_FSCK) \
x(check_extents, PASS_FSCK) \
+ x(check_indirect_extents, PASS_FSCK) \
x(check_dirents, PASS_FSCK) \
x(check_xattrs, PASS_FSCK) \
x(check_root, PASS_FSCK) \
@@ -39,6 +42,7 @@
x(check_nlinks, PASS_FSCK) \
x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 0) \
+ x(set_fs_needs_rebalance, 0) \
enum bch_recovery_pass {
#define x(n, when) BCH_RECOVERY_PASS_##n,
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index d77d0ea9..dbbdf195 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -7,6 +7,7 @@
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
+#include "rebalance.h"
#include "reflink.h"
#include "subvolume.h"
#include "super-io.h"
@@ -103,21 +104,22 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+ new->k.type = KEY_TYPE_deleted;
+ new->k.size = 0;
+ set_bkey_val_u64s(&new->k, 0);;
+ *flags &= ~BTREE_TRIGGER_INSERT;
+ }
+}
+
int bch2_trans_mark_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
-
- if (!r->v.refcount) {
- r->k.type = KEY_TYPE_deleted;
- r->k.size = 0;
- set_bkey_val_u64s(&r->k, 0);
- return 0;
- }
- }
+ check_indirect_extent_deleting(new, &flags);
return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
}
@@ -132,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
}
void bch2_indirect_inline_data_to_text(struct printbuf *out,
- struct bch_fs *c, struct bkey_s_c k)
+ struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
unsigned datalen = bkey_inline_data_bytes(k.k);
@@ -147,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
- if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
- struct bkey_i_indirect_inline_data *r =
- bkey_i_to_indirect_inline_data(new);
-
- if (!r->v.refcount) {
- r->k.type = KEY_TYPE_deleted;
- r->k.size = 0;
- set_bkey_val_u64s(&r->k, 0);
- }
- }
+ check_indirect_extent_deleting(new, &flags);
return 0;
}
@@ -260,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
+ struct bch_io_opts opts;
struct bpos src_want;
u64 dst_done;
u32 dst_snapshot, src_snapshot;
@@ -277,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
trans = bch2_trans_get(c);
+ ret = bch2_inum_opts_get(trans, src_inum, &opts);
+ if (ret)
+ goto err;
+
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@@ -360,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_extent_update(trans, dst_inum, &dst_iter,
- new_dst.k, &disk_res,
- new_i_size, i_sectors_delta,
- true);
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
+ opts.background_target,
+ opts.background_compression) ?:
+ bch2_extent_update(trans, dst_inum, &dst_iter,
+ new_dst.k, &disk_res,
+ new_i_size, i_sectors_delta,
+ true);
bch2_disk_reservation_put(c, &disk_res);
}
bch2_trans_iter_exit(trans, &dst_iter);
@@ -394,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(trans, &inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-
+err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 0e85c226..ce59018b 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -949,9 +949,6 @@ int bch2_fs_start(struct bch_fs *c)
}
for_each_online_member(ca, c, i)
- bch2_sb_from_fs(c, ca);
-
- for_each_online_member(ca, c, i)
bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
@@ -960,12 +957,6 @@ int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
- mutex_lock(&c->btree_transaction_stats[i].lock);
- bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
- mutex_unlock(&c->btree_transaction_stats[i].lock);
- }
-
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
@@ -1591,7 +1582,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
- bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+ bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
if (label.allocation_failure) {
ret = -ENOMEM;
goto err;
@@ -1689,13 +1680,13 @@ have_slot:
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
- bch_err_msg(c, ret, "marking new superblock");
+ bch_err_msg(ca, ret, "marking new superblock");
goto err_late;
}
ret = bch2_fs_freespace_init(c);
if (ret) {
- bch_err_msg(c, ret, "initializing free space");
+ bch_err_msg(ca, ret, "initializing free space");
goto err_late;
}
@@ -1763,19 +1754,26 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
- mutex_lock(&c->sb_lock);
- struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ if (!ca->mi.freespace_initialized) {
+ ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err;
+ }
- m->last_mount =
- cpu_to_le64(ktime_get_real_seconds());
+ if (!ca->journal.nr) {
+ ret = bch2_dev_journal_alloc(ca);
+ bch_err_msg(ca, ret, "allocating journal");
+ if (ret)
+ goto err;
+ }
+ mutex_lock(&c->sb_lock);
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ret = bch2_fs_freespace_init(c);
- if (ret)
- bch_err_msg(c, ret, "initializing free space");
-
up_write(&c->state_lock);
return 0;
err:
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 78d6138d..7dda4985 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -37,16 +37,4 @@ struct bch_member_cpu {
u8 valid;
};
-struct bch_disk_group_cpu {
- bool deleted;
- u16 parent;
- struct bch_devs_mask devs;
-};
-
-struct bch_disk_groups_cpu {
- struct rcu_head rcu;
- unsigned nr;
- struct bch_disk_group_cpu entries[] __counted_by(nr);
-};
-
#endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 89544dad..db2727e5 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -212,7 +212,7 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
+read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@@ -386,8 +386,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_copy_gc_wait)
bch2_copygc_wait_to_text(out, c);
- if (attr == &sysfs_rebalance_work)
- bch2_rebalance_work_to_text(out, c);
+ if (attr == &sysfs_rebalance_status)
+ bch2_rebalance_status_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
@@ -646,7 +646,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
- &sysfs_rebalance_work,
+ &sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@@ -707,10 +707,8 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_by_id(&c->opts, id, v);
if ((id == Opt_background_target ||
- id == Opt_background_compression) && v) {
- bch2_rebalance_add_work(c, S64_MAX);
- rebalance_wakeup(c);
- }
+ id == Opt_background_compression) && v)
+ bch2_set_rebalance_needs_scan(c, 0);
ret = size;
err:
@@ -910,13 +908,8 @@ SHOW(bch2_dev)
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
- if (ca->mi.group) {
- mutex_lock(&c->sb_lock);
- bch2_disk_path_to_text(out, c->disk_sb.sb,
- ca->mi.group - 1);
- mutex_unlock(&c->sb_lock);
- }
-
+ if (ca->mi.group)
+ bch2_disk_path_to_text(out, c, ca->mi.group - 1);
prt_char(out, '\n');
}
diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c
index 33efa600..dc48b52b 100644
--- a/libbcachefs/trace.c
+++ b/libbcachefs/trace.c
@@ -7,6 +7,7 @@
#include "btree_locking.h"
#include "btree_update_interior.h"
#include "keylist.h"
+#include "move_types.h"
#include "opts.h"
#include "six.h"
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 2308f49f..81f72b2a 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -767,25 +767,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
);
TRACE_EVENT(move_data,
- TP_PROTO(struct bch_fs *c, u64 sectors_moved,
- u64 keys_moved),
- TP_ARGS(c, sectors_moved, keys_moved),
+ TP_PROTO(struct bch_fs *c,
+ struct bch_move_stats *stats),
+ TP_ARGS(c, stats),
TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, sectors_moved )
+ __field(dev_t, dev )
__field(u64, keys_moved )
+ __field(u64, keys_raced )
+ __field(u64, sectors_seen )
+ __field(u64, sectors_moved )
+ __field(u64, sectors_raced )
),
TP_fast_assign(
- __entry->dev = c->dev;
- __entry->sectors_moved = sectors_moved;
- __entry->keys_moved = keys_moved;
+ __entry->dev = c->dev;
+ __entry->keys_moved = atomic64_read(&stats->keys_moved);
+ __entry->keys_raced = atomic64_read(&stats->keys_raced);
+ __entry->sectors_seen = atomic64_read(&stats->sectors_seen);
+ __entry->sectors_moved = atomic64_read(&stats->sectors_moved);
+ __entry->sectors_raced = atomic64_read(&stats->sectors_raced);
),
- TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+ TP_printk("%d,%d keys moved %llu raced %llu"
+ "sectors seen %llu moved %llu raced %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->sectors_moved, __entry->keys_moved)
+ __entry->keys_moved,
+ __entry->keys_raced,
+ __entry->sectors_seen,
+ __entry->sectors_moved,
+ __entry->sectors_raced)
);
TRACE_EVENT(evacuate_bucket,
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index b069b1a6..74b41f56 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -590,7 +590,7 @@ err:
if (value &&
(opt_id == Opt_background_compression ||
opt_id == Opt_background_target))
- bch2_rebalance_add_work(c, inode->v.i_blocks);
+ bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
}
diff --git a/linux/closure.c b/linux/closure.c
index 2958169c..1faa24d6 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -22,6 +22,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r);
if (!r) {
+ smp_acquire__after_ctrl_dep();
+
+ cl->closure_get_happened = false;
+
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
CLOSURE_REMAINING_INITIALIZER);
@@ -44,7 +48,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
/* For clearing flags with the same atomic op as a put */
void closure_sub(struct closure *cl, int v)
{
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+ closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
}
EXPORT_SYMBOL(closure_sub);
@@ -53,7 +57,7 @@ EXPORT_SYMBOL(closure_sub);
*/
void closure_put(struct closure *cl)
{
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+ closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
}
EXPORT_SYMBOL(closure_put);
@@ -91,6 +95,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
return false;
+ cl->closure_get_happened = true;
closure_set_waiting(cl, _RET_IP_);
atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
llist_add(&cl->list, &waitlist->list);