From 15b24c732749339e3f65f030e7e68624b1b4bfbd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 5 Aug 2023 18:06:22 -0400 Subject: Update bcachefs sources to 717b356d1d bcachefs: Convert journal validation to bkey_invalid_flags Signed-off-by: Kent Overstreet --- .bcachefs_revision | 2 +- cmd_device.c | 1 + cmd_dump.c | 1 + cmd_kill_btree_node.c | 1 + libbcachefs/alloc_foreground.h | 2 +- libbcachefs/bcachefs.h | 21 +- libbcachefs/bkey.c | 148 +- libbcachefs/bkey.h | 10 +- libbcachefs/bkey_methods.h | 6 - libbcachefs/btree_cache.c | 15 +- libbcachefs/btree_gc.c | 3 +- libbcachefs/btree_gc.h | 1 + libbcachefs/btree_io.c | 134 +- libbcachefs/btree_iter.c | 2 +- libbcachefs/btree_iter.h | 16 + libbcachefs/btree_journal_iter.c | 531 +++++++ libbcachefs/btree_journal_iter.h | 57 + libbcachefs/btree_trans_commit.c | 1156 ++++++++++++++ libbcachefs/btree_update.c | 943 +++++++++++ libbcachefs/btree_update_interior.c | 2 +- libbcachefs/btree_update_leaf.c | 2107 ------------------------ libbcachefs/buckets.h | 47 +- libbcachefs/checksum.c | 52 +- libbcachefs/checksum.h | 2 + libbcachefs/disk_groups.c | 1 + libbcachefs/errcode.h | 6 + libbcachefs/extents.c | 6 +- libbcachefs/extents.h | 6 +- libbcachefs/fs-io-buffered.c | 1102 +++++++++++++ libbcachefs/fs-io-buffered.h | 28 + libbcachefs/fs-io-direct.c | 678 ++++++++ libbcachefs/fs-io-direct.h | 15 + libbcachefs/fs-io-pagecache.c | 777 +++++++++ libbcachefs/fs-io-pagecache.h | 175 ++ libbcachefs/fs-io.c | 2986 ++--------------------------------- libbcachefs/fs-io.h | 166 +- libbcachefs/fs.c | 11 +- libbcachefs/fsck.c | 1 + libbcachefs/inode.c | 20 +- libbcachefs/journal_io.c | 205 +-- libbcachefs/journal_io.h | 3 +- libbcachefs/journal_reclaim.c | 3 +- libbcachefs/movinggc.c | 4 +- libbcachefs/recovery.c | 749 +-------- libbcachefs/recovery.h | 65 +- libbcachefs/sb-clean.c | 395 +++++ libbcachefs/sb-clean.h | 16 + libbcachefs/sb-members.c | 173 ++ libbcachefs/sb-members.h | 176 +++ libbcachefs/super-io.c | 460 +----- libbcachefs/super-io.h | 10 - libbcachefs/super.c | 11 +- libbcachefs/super.h | 214 --- 53 files changed, 7015 insertions(+), 6707 deletions(-) create mode 100644 libbcachefs/btree_journal_iter.c create mode 100644 libbcachefs/btree_journal_iter.h create mode 100644 libbcachefs/btree_trans_commit.c create mode 100644 libbcachefs/btree_update.c delete mode 100644 libbcachefs/btree_update_leaf.c create mode 100644 libbcachefs/fs-io-buffered.c create mode 100644 libbcachefs/fs-io-buffered.h create mode 100644 libbcachefs/fs-io-direct.c create mode 100644 libbcachefs/fs-io-direct.h create mode 100644 libbcachefs/fs-io-pagecache.c create mode 100644 libbcachefs/fs-io-pagecache.h create mode 100644 libbcachefs/sb-clean.c create mode 100644 libbcachefs/sb-clean.h create mode 100644 libbcachefs/sb-members.c create mode 100644 libbcachefs/sb-members.h diff --git a/.bcachefs_revision b/.bcachefs_revision index b2d87414..d78b2e28 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -5b8c4a1366df20bc043404cb882230ce86296590 +717b356d1dfdf178ac46e217c81bb710b7e77032 diff --git a/cmd_device.c b/cmd_device.c index c59d3709..1914629b 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -16,6 +16,7 @@ #include "libbcachefs/bcachefs_ioctl.h" #include "libbcachefs/errcode.h" #include "libbcachefs/journal.h" +#include "libbcachefs/sb-members.h" #include "libbcachefs/super-io.h" #include "cmds.h" #include "libbcachefs.h" diff --git a/cmd_dump.c b/cmd_dump.c index ab94707a..e1f0a5aa 100644 --- a/cmd_dump.c +++ b/cmd_dump.c @@ -13,6 +13,7 
@@ #include "libbcachefs/btree_iter.h" #include "libbcachefs/error.h" #include "libbcachefs/extents.h" +#include "libbcachefs/sb-members.h" #include "libbcachefs/super.h" static void dump_usage(void) diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c index a8915a1f..e9b8265d 100644 --- a/cmd_kill_btree_node.c +++ b/cmd_kill_btree_node.c @@ -11,6 +11,7 @@ #include "libbcachefs/btree_iter.h" #include "libbcachefs/errcode.h" #include "libbcachefs/error.h" +#include "libbcachefs/sb-members.h" #include "libbcachefs/super.h" static void kill_btree_node_usage(void) diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index fee195f7..7aaeec44 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -5,7 +5,7 @@ #include "bcachefs.h" #include "alloc_types.h" #include "extents.h" -#include "super.h" +#include "sb-members.h" #include diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index e1f1e8e8..30b3d7b9 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -294,8 +294,8 @@ do { \ #define bch_err_fn(_c, _ret) \ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) -#define bch_err_msg(_c, _ret, _msg) \ - bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) +#define bch_err_msg(_c, _ret, _msg, ...) \ + bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret)) #define bch_verbose(c, fmt, ...) \ do { \ @@ -995,6 +995,7 @@ struct bch_fs { enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + u64 recovery_passes_complete; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1139,22 +1140,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } -/* - * For when we need to rewind recovery passes and run a pass we skipped: - */ -static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - c->recovery_passes_explicit |= BIT_ULL(pass); - - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; - return -BCH_ERR_restart_recovery; - } else { - return 0; - } -} - #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index ee7ba700..d6960e25 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -7,14 +7,6 @@ #include "bset.h" #include "util.h" -#undef EBUG_ON - -#ifdef DEBUG_BKEYS -#define EBUG_ON(cond) BUG_ON(cond) -#else -#define EBUG_ON(cond) -#endif - const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; void bch2_bkey_packed_to_binary_text(struct printbuf *out, @@ -184,6 +176,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field) return v + offset; } +__always_inline +static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + + if (bits) { + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + } +} + __always_inline static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) { @@ -198,20 +212,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) if (fls64(v) > bits) return 
false; - if (bits > state->bits) { - bits -= state->bits; - /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - + __set_inc_field(state, field, v); return true; } @@ -380,19 +381,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ret = false; } - if (bits > state->bits) { - bits -= state->bits; - state->w |= (v >> 1) >> (bits - 1); - - *state->p = state->w; - state->p = next_word(state->p); - state->w = 0; - state->bits = 64; - } - - state->bits -= bits; - state->w |= v << state->bits; - + __set_inc_field(state, field, v); return ret; } @@ -435,6 +424,24 @@ static bool bkey_packed_successor(struct bkey_packed *out, return false; } + +static bool bkey_format_has_too_big_fields(const struct bkey_format *f) +{ + for (unsigned i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) + return true; + } + + return false; +} #endif /* @@ -515,7 +522,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0); + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); } #endif @@ -604,40 +612,74 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } - EBUG_ON(bch2_bkey_format_validate(&ret)); +#ifdef CONFIG_BCACHEFS_DEBUG + { + struct printbuf buf = PRINTBUF; + + BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); + printbuf_exit(&buf); + } +#endif return ret; } -const char *bch2_bkey_format_validate(struct bkey_format *f) +int bch2_bkey_format_invalid(struct bch_fs *c, + struct bkey_format *f, + enum bkey_invalid_flags flags, + struct printbuf *err) { unsigned i, bits = KEY_PACKED_BITS_START; - if (f->nr_fields != BKEY_NR_FIELDS) - return "incorrect number of fields"; + if (f->nr_fields != BKEY_NR_FIELDS) { + prt_printf(err, "incorrect number of fields: got %u, should be %u", + f->nr_fields, BKEY_NR_FIELDS); + return -BCH_ERR_invalid; + } /* * Verify that the packed format can't represent fields larger than the * unpacked format: */ for (i = 0; i < f->nr_fields; i++) { - unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; - u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) - : 0; - u64 field_offset = le64_to_cpu(f->field_offset[i]); - - if (packed_max + field_offset < packed_max || - packed_max + field_offset > unpacked_max) - return "field too large"; + if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) { + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, field_offset, unpacked_max); + return -BCH_ERR_invalid; + } + } bits += f->bits_per_field[i]; } - if (f->key_u64s != DIV_ROUND_UP(bits, 64)) - return "incorrect key_u64s"; + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { + prt_printf(err, "incorrect key_u64s: got %u, should be %u", + f->key_u64s, DIV_ROUND_UP(bits, 64)); + return -BCH_ERR_invalid; + } + + return 0; +} - return NULL; +void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) +{ + prt_printf(out, "u64s %u fields ", f->key_u64s); + + for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%u:%llu", + f->bits_per_field[i], + le64_to_cpu(f->field_offset[i])); + } } /* diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index e81fb3e0..51969a46 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -9,6 +9,12 @@ #include "util.h" #include "vstructs.h" +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; + #if 0 /* @@ -769,6 +775,8 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -const char *bch2_bkey_format_validate(struct bkey_format *); +int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); #endif /* _BCACHEFS_BKEY_H */ diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index d7b63769..668f595e 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -13,12 +13,6 @@ enum btree_node_type; extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; -enum bkey_invalid_flags { - BKEY_INVALID_WRITE = (1U << 0), - BKEY_INVALID_COMMIT = (1U << 1), - BKEY_INVALID_JOURNAL = (1U << 2), -}; - /* * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. 
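
[Editorial aside, not part of the patch] The new bch2_bkey_format_invalid() above replaces bch2_bkey_format_validate() and checks, per field, whether the packed representation plus its field_offset could encode a value the unpacked in-memory format cannot hold. Below is a minimal, standalone sketch of that per-field range check; the function name field_too_big and the plain C types are assumptions made for the example, not kernel code.

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the per-field check in bch2_bkey_format_invalid(): the largest
 * value representable in packed_bits, offset by field_offset, must not wrap
 * around and must still fit in unpacked_bits of the in-memory format.
 * Assumes unpacked_bits >= 1, as in bch2_bkey_format_current.
 */
static bool field_too_big(unsigned unpacked_bits, unsigned packed_bits,
			  uint64_t field_offset)
{
	/* ~((~0ULL << 1) << (bits - 1)) is the all-ones value of width `bits`,
	 * written this way so bits == 64 never shifts by 64 (undefined behaviour). */
	uint64_t unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
	uint64_t packed_max   = packed_bits
		? ~((~0ULL << 1) << (packed_bits - 1))
		: 0;

	return packed_max + field_offset < packed_max ||   /* addition wrapped */
	       packed_max + field_offset > unpacked_max;   /* exceeds unpacked format */
}

A field_offset of zero with packed_bits <= unpacked_bits always passes; the check only fires when the offset pushes the packed range past what a struct bkey field can represent.
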
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 13c88d95..a8283fdc 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -1214,7 +1214,6 @@ wait_on_io: void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - const struct bkey_format *f = &b->format; struct bset_stats stats; memset(&stats, 0, sizeof(stats)); @@ -1228,9 +1227,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); - prt_printf(out, "\n" - " format: u64s %u fields %u %u %u %u %u\n" + prt_printf(out, + " format: "); + bch2_bkey_format_to_text(out, &b->format); + + prt_printf(out, " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" " sib u64s: %u, %u (merge threshold %u)\n" @@ -1238,12 +1241,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr unpacked keys %u\n" " floats %zu\n" " failed unpacked %zu\n", - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), btree_bytes(c) - sizeof(struct btree_node), diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 49e9822d..a5f685ff 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" @@ -43,7 +44,7 @@ static bool should_restart_for_topology_repair(struct bch_fs *c) { return c->opts.fix_errors != FSCK_FIX_no && - !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); + !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); } static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index b45e382f..607575f8 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_BTREE_GC_H #define _BCACHEFS_BTREE_GC_H +#include "bkey.h" #include "btree_types.h" int bch2_check_topology(struct bch_fs *); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index c049876e..3b654841 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -17,6 +17,7 @@ #include "io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" +#include "recovery.h" #include "super-io.h" #include "trace.h" @@ -543,31 +544,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -enum btree_err_type { - /* - * We can repair this locally, and we're after the checksum check so - * there's no need to try another replica: - */ - BTREE_ERR_FIXABLE, - /* - * We can repair this if we have to, but we should try reading another - * replica if we can: - */ - BTREE_ERR_WANT_RETRY, - /* - * Read another replica if we have one, otherwise consider the whole - * node bad: - */ - BTREE_ERR_MUST_RETRY, - BTREE_ERR_BAD_NODE, - BTREE_ERR_INCOMPATIBLE, -}; - -enum btree_validate_ret { - BTREE_RETRY_READ = 64, -}; - -static int __btree_err(enum btree_err_type type, +static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, @@ -578,7 +555,6 @@ static int __btree_err(enum btree_err_type type, { struct printbuf out = PRINTBUF; va_list args; - int ret = -BCH_ERR_fsck_fix; btree_err_msg(&out, c, ca, b, i, 
b->written, write); @@ -594,27 +570,26 @@ static int __btree_err(enum btree_err_type type, goto out; } - if (!have_retry && type == BTREE_ERR_WANT_RETRY) - type = BTREE_ERR_FIXABLE; - if (!have_retry && type == BTREE_ERR_MUST_RETRY) - type = BTREE_ERR_BAD_NODE; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) + ret = -BCH_ERR_btree_node_read_err_fixable; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) + ret = -BCH_ERR_btree_node_read_err_bad_node; - switch (type) { - case BTREE_ERR_FIXABLE: + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: mustfix_fsck_err(c, "%s", out.buf); ret = -BCH_ERR_fsck_fix; break; - case BTREE_ERR_WANT_RETRY: - case BTREE_ERR_MUST_RETRY: + case -BCH_ERR_btree_node_read_err_want_retry: + case -BCH_ERR_btree_node_read_err_must_retry: bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = BTREE_RETRY_READ; break; - case BTREE_ERR_BAD_NODE: + case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); bch2_topology_error(c); ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; break; - case BTREE_ERR_INCOMPATIBLE: + case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); ret = -BCH_ERR_fsck_errors_not_fixed; break; @@ -631,8 +606,11 @@ fsck_err: ({ \ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ \ - if (_ret != -BCH_ERR_fsck_fix) \ + if (_ret != -BCH_ERR_fsck_fix) { \ + ret = _ret; \ goto fsck_err; \ + } \ + \ *saw_error = true; \ }) @@ -696,19 +674,18 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); - const char *err; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; int ret = 0; btree_err_on(!bch2_version_compatible(version), - BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), BCH_VERSION_MINOR(version)); if (btree_err_on(version < c->sb.version_min, - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { mutex_lock(&c->sb_lock); @@ -719,7 +696,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "bset version %u newer than superblock version %u", version, c->sb.version)) { mutex_lock(&c->sb_lock); @@ -729,11 +706,11 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; ret = 0; @@ -741,12 +718,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(offset && !i->u64s, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "bset at wrong sector 
offset"); if (!offset) { @@ -760,16 +737,16 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, "incorrect level"); if (!write) @@ -786,7 +763,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -795,7 +772,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), - BTREE_ERR_MUST_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, "incorrect max key %s", (printbuf_reset(&buf1), bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -804,10 +781,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, compat_btree_node(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, bn); - err = bch2_bkey_format_validate(&bn->format); - btree_err_on(err, - BTREE_ERR_BAD_NODE, c, ca, b, i, - "invalid bkey format: %s", err); + btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), + -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, + "invalid bkey format: %s\n %s", buf1.buf, + (printbuf_reset(&buf2), + bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); + printbuf_reset(&buf1); compat_bformat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, @@ -847,14 +826,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey tmp; if (btree_err_on(bkey_p_next(k) > vstruct_last(i), - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } if (btree_err_on(k->format > KEY_FORMAT_CURRENT, - BTREE_ERR_FIXABLE, c, NULL, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -878,7 +857,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), @@ -902,7 +881,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_dump_bset(c, b, i, 0); - if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { + if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); @@ -945,16 +924,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, iter->size = (btree_blocks(c) + 1) * 2; if 
(bch2_meta_read_fault("btree")) - btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); btree_err_on(!b->data->keys.seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "bad btree header: seq 0"); if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -962,7 +941,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, &bkey_i_to_btree_ptr_v2(&b->key)->v; btree_err_on(b->data->keys.seq != bp->seq, - BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, "got wrong btree node (seq %llx want %llx)", b->data->keys.seq, bp->seq); } @@ -977,7 +956,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, i = &b->data->keys; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -985,7 +964,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); btree_err_on(bch2_crc_cmp(csum, b->data->csum), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -995,7 +974,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -1007,7 +986,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, break; btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1015,7 +994,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); btree_err_on(bch2_crc_cmp(csum, bne->csum), - BTREE_ERR_WANT_RETRY, c, ca, b, i, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, "invalid checksum"); ret = bset_encrypt(c, i, b->written << 9); @@ -1048,12 +1027,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, true); btree_err_on(blacklisted && first, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, - BTREE_ERR_FIXABLE, c, ca, b, i, + -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); @@ -1072,7 +1051,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, - BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, "btree node data missing: expected %u 
sectors, found %u", ptr_written, b->written); } else { @@ -1083,7 +1062,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, !bch2_journal_seq_is_blacklisted(c, le64_to_cpu(bne->keys.journal_seq), true), - BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, "found bset signature after last bset"); /* @@ -1137,7 +1116,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, u.s_c); - btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); btree_keys_account_key_drop(&b->nr, 0, k); @@ -1177,7 +1156,8 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) retry_read = 1; else set_btree_node_read_error(b); @@ -1363,14 +1343,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl) } written2 = btree_node_sectors_written(c, ra->buf[i]); - if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), - BTREE_ERR_FIXABLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), - BTREE_ERR_FIXABLE, c, NULL, b, NULL, + -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, "btree node replicas content mismatch")) dump_bset_maps = true; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index dfb77b23..06dbb617 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -5,6 +5,7 @@ #include "bkey_buf.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" @@ -12,7 +13,6 @@ #include "error.h" #include "extents.h" #include "journal.h" -#include "recovery.h" #include "replicas.h" #include "subvolume.h" #include "trace.h" diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index c472aa8c..8876f2b8 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -221,6 +221,22 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, struct btree_iter *, struct bpos); diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c new file mode 100644 index 00000000..58a981bc --- /dev/null +++ b/libbcachefs/btree_journal_iter.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bset.h" +#include "btree_journal_iter.h" +#include 
"journal_io.h" + +#include + +/* + * For managing keys we read from the journal: until journal replay works normal + * btree lookups need to be able to find and return keys from the journal where + * they overwrite what's in the btree, so we have a special iterator and + * operations for the regular btree iter code to use: + */ + +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } +} + +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->size && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (idx > keys->gap) + idx -= keys->size - keys->nr; + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = max_t(size_t, keys->size, 8) * 2, + }; + + new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -BCH_ERR_ENOMEM_journal_key_insert; + } + + /* Since @keys was full, there was no gap: */ + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; + } + + journal_iters_move_gap(c, keys->gap, idx); + + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; + + journal_iters_fix(c); + + return 0; +} + +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_key_insert; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, 
n); + if (ret) + kfree(n); + return ret; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->size && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + bpos_eq(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + struct journal_key *k = iter->keys->d + iter->idx; + + while (k < iter->keys->d + iter->keys->size && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return bkey_i_to_s_c(k->k); + + bch2_journal_iter_advance(iter); + k = iter->keys->d + iter->idx; + } + + return bkey_s_c_null; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + if (bpos_eq(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; + + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); + + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); + + ret = journal_k.k && + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) + ? 
journal_k + : btree_k; + + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) + ret = bkey_s_c_null; + + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; + } + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; + } + + return ret; +} + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) +{ + bch2_journal_iter_exit(&iter->journal); +} + +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; +} + +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) +{ + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ + +void bch2_journal_entries_free(struct bch_fs *c) +{ + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); +} + +/* + * When keys compare equal, oldest compares first: + */ +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +void bch2_journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; + + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + + kvfree(keys->d); + keys->d = NULL; + keys->nr = keys->gap = keys->size = 0; +} + +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + +int bch2_journal_keys_sort(struct bch_fs *c) +{ + struct genradix_iter iter; + struct journal_replay *i, **_i; + struct jset_entry *entry; + struct bkey_i *k; + struct journal_keys *keys = &c->journal_keys; + size_t nr_keys = 0, nr_read = 0; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + for_each_jset_key(k, entry, &i->j) + nr_keys++; + } + + if (!nr_keys) + return 0; + + keys->size = roundup_pow_of_two(nr_keys); + + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", + 
nr_keys); + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + keys->d[keys->nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + nr_read++; + } + } + + __journal_keys_sort(keys); + keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + return 0; +} diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h new file mode 100644 index 00000000..5d64e7e2 --- /dev/null +++ b/libbcachefs/btree_journal_iter.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_H + +struct journal_iter { + struct list_head list; + enum btree_id btree_id; + unsigned level; + size_t idx; + struct journal_keys *keys; +}; + +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ + +struct btree_and_journal_iter { + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + + struct journal_iter journal; + struct bpos pos; + bool at_end; +}; + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct bch_fs *); + +int bch2_journal_keys_sort(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c new file mode 100644 index 00000000..78a09aa0 --- /dev/null +++ b/libbcachefs/btree_trans_commit.c @@ -0,0 +1,1156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_gc.h" +#include "btree_io.h" 
+#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "subvolume.h" + +#include + +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; + + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} + +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + +static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + insert_l(&i[0])->b == insert_l(&i[-1])->b; +} + +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i + 1 < trans->updates + trans->nr_updates && + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(trans, b); +} + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bkey_packed *k; + unsigned clobber_u64s = 0, new_u64s = 0; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, b)); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ + EBUG_ON(k && bkey_deleted(k)); + + /* Deleting, but not found? 
nothing to do: */ + if (bkey_deleted(&insert->k) && !k) + return false; + + if (bkey_deleted(&insert->k)) { + /* Deleting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) + push_whiteout(trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + + return true; + } + + if (k) { + /* Overwriting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, + clobber_u64s, new_u64s); + return true; +} + +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans trans; + unsigned long old, new, v; + unsigned idx = w - b->writes; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_journal_reclaim; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); + return 0; +} + +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? 
bch2_btree_node_flush0 + : bch2_btree_node_flush1); +} + +/** + * btree_insert_key - insert a key one key into a leaf node + */ +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) + return; + + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_btree_node_dirty_acct(c, b); + } + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); +} + +/* Cached btree updates: */ + +/* Normal update interface: */ + +static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); +} + +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, + unsigned long trace_ip) +{ + return drop_locks_do(trans, + bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, + trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK))); +} + +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, flags); +} + +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + container_of(entry, struct jset_entry_log, entry); + + strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); +} + +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) +{ + struct bch_fs *c = trans->c; + + if (!bch2_btree_node_insert_fits(c, b, u64s)) + return -BCH_ERR_btree_insert_btree_node_full; + + return 0; +} + +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; + unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + 
bch2_btree_key_cache_must_wait(c) && + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return -BCH_ERR_btree_insert_need_journal_reclaim; + + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + + if (u64s <= ck->u64s) + return 0; + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[path->btree_id], new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + int ret; + + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) + return 0; + + if (old_ops->atomic_trigger == new_ops->atomic_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + old_ops->trans_trigger == new_ops->trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; 
overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); +#endif + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc + */ + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } + } + + return ret; +} + +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; + bool marking = false; + int ret; + + if (race_fault()) { + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + + prefetch(&trans->c->journal.flags); + + trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; + ret = !i->cached + ? 
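run_btree_triggers() above has to keep re-walking the update list because a transactional trigger may append new entries while the walk is in progress; it only stops once a complete pass runs no triggers. A self-contained sketch of that fixed-point loop (plain C; run_trigger() is a hypothetical stand-in for run_one_trans_trigger()):

#include <stdbool.h>
#include <stddef.h>

struct update {
	int	btree_id;
	bool	trigger_run;
};

/* Stand-in for run_one_trans_trigger(): returns 1 if it did work, 0 otherwise. */
static int run_trigger(struct update *u)
{
	if (u->trigger_run)
		return 0;
	u->trigger_run = true;
	/* a real trigger may append more entries to the update array here */
	return 1;
}

/*
 * Walk the (possibly growing) update array until a full pass runs no
 * triggers - the same fixed point run_btree_triggers() reaches.
 */
static void run_until_stable(struct update *updates, size_t *nr)
{
	bool ran;

	do {
		ran = false;
		for (size_t i = 0; i < *nr; i++)
			if (run_trigger(&updates[i]))
				ran = true;
	} while (ran);
}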
btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, i->path, u64s); + if (ret) { + *stopped_at = i; + return ret; + } + + if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; + } + + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + ret = bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } + + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (bch2_journal_seq_verify) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (bch2_inject_invalid_keys) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + if (trans->nr_wb_updates) { + EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + goto fatal_err; + } + + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + goto fatal_err; + } + + if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + trans_for_each_update(trans, i) { + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + verify_update_old_key(trans, i); + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy(&entry->start[0], i->k); + } + + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + bkey_copy(&entry->start[0], &wb->k); + } + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } + + trans_for_each_update(trans, i) { + i->k->k.needs_whiteout = false; + + if (!i->cached) { + u64 seq = trans->journal_res.seq; + + if (i->flags & BTREE_UPDATE_PREJOURNAL) + seq = i->seq; + + bch2_btree_insert_key_leaf(trans, 
i->path, i->k, seq); + } else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, flags, i); + else { + bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } + } + + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + return ret; +} + +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + return 0; +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + printbuf_exit(err); + + return -EINVAL; +} +#endif + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0, u64s_delta = 0; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; + } +#endif + + trans_for_each_update(trans, i) { + if (i->cached) + continue; + + u64s_delta += !bkey_deleted(&i->k->k) ? 
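The debug check in do_bch2_trans_commit() validates every key being committed with bkey_invalid_flags (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT outside journal replay) rather than a plain READ/WRITE argument. A caller wanting the same strictness on its own keys could do something along these lines - a sketch assuming the usual bcachefs headers, where check_key_for_commit() is a hypothetical helper, not part of this patch:

/* Sketch: validate a key with the same flags the commit-path debug check uses. */
static int check_key_for_commit(struct bch_fs *c, struct bkey_i *k,
				enum btree_node_type type)
{
	struct printbuf buf = PRINTBUF;
	enum bkey_invalid_flags invalid_flags =
		BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
	int ret = 0;

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), type, invalid_flags, &buf)) {
		bch2_print_string_as_lines(KERN_ERR, buf.buf);
		ret = -EINVAL;
	}

	printbuf_exit(&buf);
	return ret;
}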
i->k->k.u64s : 0; + u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; + } + } + + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) + ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); + if (unlikely(ret)) + return ret; + + ret = trans_lock_write(trans); + if (unlikely(ret)) + return ret; + + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); + + if (!ret && unlikely(trans->journal_replay_not_finished)) + bch2_drop_overwrites_from_journal(trans); + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + trans->journal_pin, NULL); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + + bch2_trans_downgrade(trans); + + return 0; +} + +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); + + if (!ret) + journal_reclaim_kick(&c->journal); + return ret; +} + +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + int ret, unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + break; + case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); + break; + case -BCH_ERR_btree_insert_need_journal_reclaim: + bch2_trans_unlock(trans); + + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + break; + + ret = bch2_trans_relock(trans); + break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + 
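Several of the error cases in bch2_trans_commit_error() above recover by redoing work with the transaction's btree locks dropped; drop_locks_do() is the helper for that - unlock, evaluate the expression, relock. The same pattern appears again below when a commit has to flip the filesystem read-write first. A minimal sketch, assuming the usual bcachefs headers:

/*
 * Sketch: run a potentially blocking call without holding btree locks across
 * it.  drop_locks_do() unlocks the transaction, evaluates the expression, and
 * relocks before returning.
 */
static int go_rw_with_locks_dropped(struct btree_trans *trans)
{
	return drop_locks_do(trans, bch2_fs_read_write_early(trans->c));
}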
ret = bch2_trans_relock(trans); + } + } + break; + } + default: + BUG_ON(ret >= 0); + break; + } + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(flags & BTREE_INSERT_NOWAIT) && + (flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); + + return ret; +} + +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + int ret; + + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) + return -BCH_ERR_erofs_trans_commit; + + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); + if (ret) + return ret; + + bch2_write_ref_get(c, BCH_WRITE_REF_trans); + return 0; +} + +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. + */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; + } + + return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; + unsigned u64s; + int ret = 0; + + if (!trans->nr_updates && + !trans->nr_wb_updates && + !trans->extra_journal_entries.nr) + goto out_reset; + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } + + if (!(flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); + if (ret) + goto out_reset; + } + + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_preres_u64s = 0; + + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + + trans_for_each_update(trans, i) { + EBUG_ON(!i->path->should_be_locked); + + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) + goto out; + + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + + if (i->key_cache_already_flushed) + continue; + + /* we're going to journal the key being updated: */ + u64s = jset_u64s(i->k->k.u64s); + if (i->cached && + likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) + trans->journal_preres_u64s += u64s; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + trans->journal_u64s += 
u64s; + + /* and we're also going to log the overwrite: */ + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); + } + + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } +retry: + bch2_trans_verify_not_in_restart(trans); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ + bch2_trans_verify_locks(trans); + + if (ret) + goto err; + + trace_and_count(c, transaction_commit, trans, _RET_IP_); +out: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); +out_reset: + bch2_trans_reset_updates(trans); + + return ret; +err: + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + if (ret) + goto out; + + goto retry; +} diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c new file mode 100644 index 00000000..612fba60 --- /dev/null +++ b/libbcachefs/btree_update.c @@ -0,0 +1,943 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "keylist.h" +#include "subvolume.h" +#include "trace.h" + +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p); +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags, + unsigned long ip); + +static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); +} + +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: + 
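__bch2_trans_commit() above reports recoverable failures as transaction-restart errors, and the caller is expected to retry the whole transaction from bch2_trans_begin() onwards. A minimal sketch of that caller-side loop (assuming bcachefs headers; do_update() is a hypothetical callback that rebuilds the updates on every pass):

/*
 * Sketch: rebuild the updates and re-commit until the commit stops returning
 * a transaction restart.  Helpers such as bch2_trans_do() wrap the same loop.
 */
static int commit_with_retry(struct btree_trans *trans,
			     int (*do_update)(struct btree_trans *))
{
	int ret;

	do {
		bch2_trans_begin(trans);

		ret = do_update(trans) ?:
			bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

	return ret;
}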
check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter = { NULL }; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret; + + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; + + darray_init(&s); + + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_update_flags flags, + struct bkey_s_c old, + struct bkey_s_c new) +{ + enum btree_id btree_id = iter->btree_id; + struct bkey_i *update; + struct bpos new_start = bkey_start_pos(new.k); + bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); + bool back_split = bkey_gt(old.k->p, new.k->p); + int ret = 0, compressed_sectors; + + /* + * If we're going to be 
splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(old))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_back(new_start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + /* If we're overwriting in a different snapshot - middle split: */ + if (old.k->p.snapshot != new.k->p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new_start, update); + bch2_cut_back(new.k->p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (bkey_le(old.k->p, new.k->p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bkey_init(&update->k); + update->k.p = old.k->p; + update->k.p.snapshot = new.k->p.snapshot; + + if (new.k->p.snapshot != old.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + return ret; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } + + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new.k->p, update); + + ret = bch2_trans_update_by_path(trans, iter->path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags, _RET_IP_); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); + if (ret) + goto err; + } + + goto next; + } + + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool done = bkey_lt(insert->k.p, k.k->p); + + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + goto err; + + if (done) + goto out; +next: + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = extent_back_merge(trans, 
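bch2_trans_update_extent_overwrite() above reduces any overlap between an existing extent and an incoming one to front and back fragments of the old extent plus, when snapshots are involved, a middle fragment and whiteouts. The interval arithmetic on its own, as a self-contained sketch in plain C with positions reduced to integers (the snapshot cases are ignored here):

#include <stdbool.h>
#include <stdint.h>

struct frag {
	uint64_t	start, end;
	bool		exists;
};

/*
 * Given an existing extent [old_start, old_end) overwritten by
 * [new_start, new_end), compute the surviving front and back fragments.
 * The caller guarantees the two ranges overlap, as bch2_trans_update_extent()
 * does before calling the overwrite path.
 */
static void extent_overwrite_frags(uint64_t old_start, uint64_t old_end,
				   uint64_t new_start, uint64_t new_end,
				   struct frag *front, struct frag *back)
{
	front->exists	= old_start < new_start;	/* front split */
	front->start	= old_start;
	front->end	= front->exists ? new_start : old_start;

	back->exists	= old_end > new_end;		/* back split */
	back->start	= back->exists ? new_end : old_end;
	back->end	= old_end;
}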
&iter, insert, k); + if (ret) + goto err; + } +out: + if (!bkey_deleted(&insert->k)) + ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) +{ + struct btree_path *btree_path; + struct bkey k; + int ret; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. + */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); +out: + bch2_path_put(trans, btree_path, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; + u64 seq = 0; + int cmp; + + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); + + /* + * The transaction journal res hasn't been allocated at this point. + * That occurs at commit time. Reuse the seq field to pass in the seq + * of a prejournaled key. + */ + if (flags & BTREE_UPDATE_PREJOURNAL) + seq = trans->journal_res.seq; + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, + .k = k, + .seq = seq, + .ip_allocated = ip, + }; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: + */ + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) + break; + } + + if (!cmp && i < trans->updates + trans->nr_updates) { + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + bch2_path_put(trans, i->path, true); + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->seq = n.seq; + i->ip_allocated = n.ip_allocated; + } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. 
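bch2_trans_update_by_path() above keeps the pending updates sorted by (btree id, cached, level descending, key position) and replaces the existing entry when a second update lands on the same position. A self-contained sketch of that insert-or-replace step (plain C with a simplified entry type; the real code uses btree_insert_entry_cmp() and array_insert_item()):

#include <stddef.h>
#include <string.h>

struct entry {
	int		btree_id, cached, level;
	long long	pos;
};

/* Same ordering as btree_insert_entry_cmp(): level sorts descending. */
static int entry_cmp(const struct entry *l, const struct entry *r)
{
	if (l->btree_id != r->btree_id)	return l->btree_id < r->btree_id ? -1 : 1;
	if (l->cached	!= r->cached)	return l->cached   < r->cached	 ? -1 : 1;
	if (l->level	!= r->level)	return l->level	   > r->level	 ? -1 : 1;
	if (l->pos	!= r->pos)	return l->pos	   < r->pos	 ? -1 : 1;
	return 0;
}

/*
 * Insert n into the sorted array, or overwrite an entry that compares equal.
 * The caller guarantees the array has room for one more element.
 */
static void insert_or_replace(struct entry *a, size_t *nr, struct entry n)
{
	size_t i;

	for (i = 0; i < *nr; i++)
		if (entry_cmp(&n, &a[i]) <= 0)
			break;

	if (i < *nr && !entry_cmp(&n, &a[i])) {
		a[i] = n;
		return;
	}

	memmove(&a[i + 1], &a[i], (*nr - i) * sizeof(n));
	a[i] = n;
	(*nr)++;
}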
When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (path->cached && bkey_deleted(&i->old_k)) + return flush_new_cached_update(trans, path, i, flags, ip); + + return 0; +} + +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + /* + * Ensure that updates to cached btrees go to the key cache: + */ + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); +} + +/* + * Add a transaction update for a key that has already been journaled. 
+ */ +int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, + struct btree_iter *iter, struct bkey_i *k, + enum btree_update_flags flags) +{ + trans->journal_res.seq = seq; + return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| + BTREE_UPDATE_PREJOURNAL); +} + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/** + * bch2_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @insert_keys: list of keys to insert + * @hook: insert callback + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + struct 
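bch2_trans_commit_hook() above registers a hook that bch2_trans_commit_write_locked() calls with btree write locks and the journal reservation held, so it must not block; a nonzero return fails the commit with that error. A sketch of defining and registering one (assuming bcachefs headers; my_hook and my_hook_fn are hypothetical, and the hook memory has to stay valid until the commit runs - typically it is allocated with bch2_trans_kmalloc()):

struct my_hook {
	struct btree_trans_commit_hook	h;
	u64				arg;
};

/* Runs inside the commit, after the journal reservation is taken. */
static int my_hook_fn(struct btree_trans *trans, struct btree_trans_commit_hook *h)
{
	struct my_hook *m = container_of(h, struct my_hook, h);

	return m->arg ? 0 : -EINVAL;	/* placeholder check */
}

static void register_my_hook(struct btree_trans *trans, struct my_hook *m)
{
	m->h.fn = my_hook_fn;
	bch2_trans_commit_hook(trans, &m->h);
}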
bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, int flags) +{ + return bch2_trans_do(c, disk_res, journal_seq, flags, + __bch2_btree_insert(&trans, id, k, 0)); +} + +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = iter->pos; + bch2_key_resize(&k->k, len); + return bch2_trans_update(trans, iter, k, update_flags); +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + u32 restart_count = trans->restart_count; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + + /* + * This could probably be more efficient for extents: + */ + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). 
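__bch2_btree_insert() above takes an existing transaction, so multiple inserts can be made atomic by composing them in one commit; bch2_btree_insert() is just the one-key convenience wrapper around bch2_trans_do(). A sketch of the composed form (assuming bcachefs headers; the btree ids and keys are caller-provided):

/*
 * Sketch: insert two keys in a single atomic transaction by chaining
 * __bch2_btree_insert() calls inside bch2_trans_do(), which supplies the
 * transaction and the restart/retry loop.
 */
static int insert_two_keys(struct bch_fs *c,
			   enum btree_id id1, struct bkey_i *k1,
			   enum btree_id id2, struct bkey_i *k2)
{
	return bch2_trans_do(c, NULL, NULL, 0,
			     __bch2_btree_insert(&trans, id1, k1, 0) ?:
			     __bch2_btree_insert(&trans, id2, k2, 0));
}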
+ */ + delete.k.p = iter.pos; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + + ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +} + +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k; + int ret = 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + return bch2_trans_update_buffered(trans, btree, k); +} + +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + struct jset_entry_log *l; + unsigned u64s; + int ret; + + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; + + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; + + l = (void *) &darray_top(*entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; + + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) +{ + int ret; + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, + __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + } + + return ret; +} + +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
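Both log-message wrappers funnel into __bch2_fs_log_msg() above: before the journal has started the formatted message is stashed in early_journal_entries, afterwards it is committed as a BCH_JSET_ENTRY_log entry. Typical usage is a printf-style one-liner; bch2_journal_log_msg() below is the variant for recovery paths, committing with the reclaim watermark so it does not block on journal space. A usage sketch (assuming bcachefs headers; the message text is arbitrary):

/* Sketch: leave a human-readable marker in the journal. */
static void note_degraded_mount(struct bch_fs *c, unsigned missing)
{
	bch2_fs_log_msg(c, "mounting with %u device(s) missing", missing);
}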
+{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); + va_end(args); + return ret; +} diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index f42ef46c..986dd541 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -5,6 +5,7 @@ #include "bkey_methods.h" #include "btree_cache.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -17,7 +18,6 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" -#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c deleted file mode 100644 index 369e37a4..00000000 --- a/libbcachefs/btree_update_leaf.c +++ /dev/null @@ -1,2107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_gc.h" -#include "btree_io.h" -#include "btree_iter.h" -#include "btree_key_cache.h" -#include "btree_locking.h" -#include "btree_write_buffer.h" -#include "buckets.h" -#include "debug.h" -#include "errcode.h" -#include "error.h" -#include "extent_update.h" -#include "journal.h" -#include "journal_reclaim.h" -#include "keylist.h" -#include "recovery.h" -#include "subvolume.h" -#include "replicas.h" -#include "trace.h" - -#include -#include - -/* - * bch2_btree_path_peek_slot() for a cached iterator might return a key in a - * different snapshot: - */ -static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -{ - struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); - - if (k.k && bpos_eq(path->pos, k.k->p)) - return k; - - bkey_init(u); - u->p = path->pos; - return (struct bkey_s_c) { u, NULL }; -} - -static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; - struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); - - if (j_k) - k = bkey_i_to_s_c(j_k); - } - - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); -#endif -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags, - unsigned long ip); - -static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, - const struct btree_insert_entry *r) -{ - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->cached, r->cached) ?: - -cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p); -} - -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) -{ - return i->path->l + i->level; -} - -static inline bool same_leaf_as_prev(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; -} - -static inline bool same_leaf_as_next(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; -} - -inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, - struct btree_path *path, 
- struct btree *b) -{ - struct bch_fs *c = trans->c; - - if (unlikely(btree_node_just_written(b)) && - bch2_btree_post_write_cleanup(c, b)) - bch2_trans_node_reinit_iter(trans, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(trans, b); -} - -/* Inserting into a given leaf node (last stage of insert): */ - -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - struct bkey_packed *k; - unsigned clobber_u64s = 0, new_u64s = 0; - - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); - EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); - EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(trans->c, b)); - - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) - k = NULL; - - /* @k is the key being overwritten/deleted, if any: */ - EBUG_ON(k && bkey_deleted(k)); - - /* Deleting, but not found? nothing to do: */ - if (bkey_deleted(&insert->k) && !k) - return false; - - if (bkey_deleted(&insert->k)) { - /* Deleting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - if (k->needs_whiteout) - push_whiteout(trans->c, b, insert->k.p); - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - bch2_bset_delete(b, k, clobber_u64s); - goto fix_iter; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - - return true; - } - - if (k) { - /* Overwriting: */ - btree_account_key_drop(b, k); - k->type = KEY_TYPE_deleted; - - insert->k.needs_whiteout = k->needs_whiteout; - k->needs_whiteout = false; - - if (k >= btree_bset_last(b)->start) { - clobber_u64s = k->u64s; - goto overwrite; - } else { - bch2_btree_path_fix_key_modified(trans, b, k); - } - } - - k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - new_u64s = k->u64s; -fix_iter: - if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(trans, path, b, node_iter, k, - clobber_u64s, new_u64s); - return true; -} - -static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - struct btree_trans trans; - unsigned long old, new, v; - unsigned idx = w - b->writes; - - bch2_trans_init(&trans, c, 0, 0); - - btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); - v = READ_ONCE(b->flags); - - do { - old = new = v; - - if (!(old & (1 << BTREE_NODE_dirty)) || - !!(old & (1 << BTREE_NODE_write_idx)) != idx || - w->journal.seq != seq) - break; - - new &= ~BTREE_WRITE_TYPE_MASK; - new |= BTREE_WRITE_journal_reclaim; - new |= 1 << BTREE_NODE_need_write; - } while ((v = cmpxchg(&b->flags, old, new)) != old); - - btree_node_write_if_need(c, b, SIX_LOCK_read); - six_unlock_read(&b->c.lock); - - bch2_trans_exit(&trans); - return 0; -} - -int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -int 
bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -inline void bch2_btree_add_journal_pin(struct bch_fs *c, - struct btree *b, u64 seq) -{ - struct btree_write *w = btree_current_write(b); - - bch2_journal_pin_add(&c->journal, seq, &w->journal, - btree_node_write_idx(b) == 0 - ? bch2_btree_node_flush0 - : bch2_btree_node_flush1); -} - -/** - * btree_insert_key - insert a key one key into a leaf node - */ -inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert, - u64 journal_seq) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(path)->b; - struct bset_tree *t = bset_tree_last(b); - struct bset *i = bset(b, t); - int old_u64s = bset_u64s(t); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, - &path_l(path)->iter, insert))) - return; - - i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - - bch2_btree_add_journal_pin(c, b, journal_seq); - - if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - set_btree_node_dirty_acct(c, b); - } - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) bset_u64s(t) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(c, b)) - bch2_trans_node_reinit_iter(trans, b); -} - -/* Cached btree updates: */ - -/* Normal update interface: */ - -static inline void btree_insert_entry_checks(struct btree_trans *trans, - struct btree_insert_entry *i) -{ - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); - EBUG_ON(!i->level && - !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && - test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && - i->k->k.p.snapshot && - bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); -} - -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - -static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) -{ - return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, - trans->journal_u64s, flags); -} - -#define JSET_ENTRY_LOG_U64s 4 - -static noinline void journal_transaction_name(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct jset_entry *entry = - bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_log, 0, 0, - JSET_ENTRY_LOG_U64s); - struct jset_entry_log *l = - container_of(entry, struct jset_entry_log, entry); - - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -} - -static inline int btree_key_can_insert(struct btree_trans *trans, - struct btree *b, unsigned u64s) -{ - struct bch_fs *c = trans->c; - - if (!bch2_btree_node_insert_fits(c, b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; - - return 0; -} - -static int 
btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, - struct btree_path *path, unsigned u64s) -{ - struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; - struct btree_insert_entry *i; - unsigned new_u64s; - struct bkey_i *new_k; - - EBUG_ON(path->level); - - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(c) && - !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) - return -BCH_ERR_btree_insert_need_journal_reclaim; - - /* - * bch2_varint_decode can read past the end of the buffer by at most 7 - * bytes (it won't be used): - */ - u64s += 1; - - if (u64s <= ck->u64s) - return 0; - - new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[path->btree_id], new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } - - trans_for_each_update(trans, i) - if (i->old_v == &ck->k->v) - i->old_v = &new_k->v; - - ck->u64s = new_u64s; - ck->k = new_k; - return 0; -} - -/* Triggers: */ - -static int run_one_mem_trigger(struct btree_trans *trans, - struct btree_insert_entry *i, - unsigned flags) -{ - struct bkey_s_c old = { &i->old_k, i->old_v }; - struct bkey_i *new = i->k; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - int ret; - - verify_update_old_key(trans, i); - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) - return 0; - - if (old_ops->atomic_trigger == new_ops->atomic_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, i->btree_id, i->level, - old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - - _deleted.p = i->path->pos; - - ret = bch2_mark_key(trans, i->btree_id, i->level, - deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, i->btree_id, i->level, - old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) -{ - /* - * Transactional triggers create new btree_insert_entries, so we can't - * pass them a pointer to a btree_insert_entry, that memory is going to - * move: - */ - struct bkey old_k = i->old_k; - struct bkey_s_c old = { &old_k, i->old_v }; - const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); - const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); - - verify_update_old_key(trans, i); - - if ((i->flags & BTREE_TRIGGER_NORUN) || - !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) - return 0; - - if (!i->insert_trigger_run && - !i->overwrite_trigger_run && - old_ops->trans_trigger == new_ops->trans_trigger && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - i->overwrite_trigger_run = true; - i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_OVERWRITE| - i->flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { - i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { - i->insert_trigger_run 
= true; - return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; - } else { - return 0; - } -} - -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - struct btree_insert_entry *btree_id_start) -{ - struct btree_insert_entry *i; - bool trans_trigger_run; - int ret, overwrite; - - for (overwrite = 1; overwrite >= 0; --overwrite) { - - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (i = btree_id_start; - i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; - i++) { - if (i->btree_id != btree_id) - continue; - - ret = run_one_trans_trigger(trans, i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - } - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; - unsigned btree_id = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - while (btree_id_start < trans->updates + trans->nr_updates && - btree_id_start->btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); - if (ret) - return ret; - } - - trans_for_each_update(trans, i) { - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, i); - if (ret) - return ret; - break; - } - } - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && - (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); -#endif - return 0; -} - -static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - - trans_for_each_update(trans, i) { - /* - * XXX: synchronization of cached update triggers with gc - * XXX: synchronization of interior node updates with gc - */ - BUG_ON(i->cached || i->level); - - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { - ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); - if (ret) - break; - } - } - - return ret; -} - -static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - struct btree_trans_commit_hook *h; - unsigned u64s = 0; - bool marking = false; - int ret; - - if (race_fault()) { - trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); - } - - /* - * Check if the insert will fit in the leaf node with the write lock - * held, otherwise another thread could write the node changing the - * amount of space available: - */ - - prefetch(&trans->c->journal.flags); - - trans_for_each_update(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - u64s += i->k->k.u64s; - 
ret = !i->cached - ? btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); - if (ret) { - *stopped_at = i; - return ret; - } - - if (btree_node_type_needs_gc(i->bkey_type)) - marking = true; - } - - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; - - /* - * Don't get journal reservation until after we know insert will - * succeed: - */ - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - ret = bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_NONBLOCK); - if (ret) - return ret; - - if (unlikely(trans->journal_transaction_names)) - journal_transaction_name(trans); - } else { - trans->journal_res.seq = c->journal.replay_journal_seq; - } - - /* - * Not allowed to fail after we've gotten our journal reservation - we - * have to use it: - */ - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { - if (bch2_journal_seq_verify) - trans_for_each_update(trans, i) - i->k->k.version.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) - trans_for_each_update(trans, i) - i->k->k.version = MAX_VERSION; - } - - if (trans->fs_usage_deltas && - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return -BCH_ERR_btree_insert_need_mark_replicas; - - if (trans->nr_wb_updates) { - EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } - - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - goto revert_fs_usage; - h = h->next; - } - - trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { - ret = run_one_mem_trigger(trans, i, i->flags); - if (ret) - goto fatal_err; - } - - if (unlikely(c->gc_pos.phase)) { - ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; - } - - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - struct journal *j = &c->journal; - struct jset_entry *entry; - - trans_for_each_update(trans, i) { - if (i->key_cache_already_flushed) - continue; - - if (i->flags & BTREE_UPDATE_NOJOURNAL) - continue; - - verify_update_old_key(trans, i); - - if (trans->journal_transaction_names) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble(&entry->start[0], - (struct bkey_s_c) { &i->old_k, i->old_v }); - } - - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - i->btree_id, i->level, - i->k->k.u64s); - bkey_copy(&entry->start[0], i->k); - } - - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy(&entry->start[0], &wb->k); - } - - if (trans->journal_seq) - *trans->journal_seq = trans->journal_res.seq; - } - - trans_for_each_update(trans, i) { - i->k->k.needs_whiteout = false; - - if (!i->cached) { - u64 seq = trans->journal_res.seq; - - if (i->flags & BTREE_UPDATE_PREJOURNAL) - seq = i->seq; - - 
bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); - } else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, flags, i); - else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); - } - } - - return 0; -fatal_err: - bch2_fatal_error(c); -revert_fs_usage: - if (trans->fs_usage_deltas) - bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); - return ret; -} - -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - -static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; - - trans_for_each_update(trans, i) - bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); -} - -#ifdef CONFIG_BCACHEFS_DEBUG -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps", - trans->fn, (void *) i->ip_allocated); - prt_newline(err); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - printbuf_exit(err); - - return -EINVAL; -} -#endif - -/* - * Get journal reservation, take write locks, and attempt to do btree update(s): - */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry **stopped_at, - unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0, u64s_delta = 0; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } -#endif - - trans_for_each_update(trans, i) { - if (i->cached) - continue; - - u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; - u64s_delta -= i->old_btree_u64s; - - if (!same_leaf_as_next(trans, i)) { - if (u64s_delta <= 0) { - ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, flags); - if (unlikely(ret)) - return ret; - } - - u64s_delta = 0; - } - } - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - - ret = trans_lock_write(trans); - if (unlikely(ret)) - return ret; - - ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); - - if (!ret && unlikely(trans->journal_replay_not_finished)) - bch2_drop_overwrites_from_journal(trans); - - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); - - if (!ret && trans->journal_pin) - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, - trans->journal_pin, NULL); - - /* - * Drop journal reservation after dropping write locks, since dropping - * the journal reservation may kick off a journal write: - */ - bch2_journal_res_put(&c->journal, &trans->journal_res); - - if (unlikely(ret)) - return ret; - - bch2_trans_downgrade(trans); - - return 0; -} - -static int journal_reclaim_wait_done(struct bch_fs *c) -{ - int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); - - if (!ret) - journal_reclaim_kick(&c->journal); - return ret; -} - -static noinline -int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, - struct btree_insert_entry *i, - int ret, unsigned long trace_ip) -{ - struct bch_fs *c = trans->c; - - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); - break; - case -BCH_ERR_journal_res_get_blocked: - /* - * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK - * flag - */ - if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; - } - - ret = drop_locks_do(trans, - bch2_trans_journal_res_get(trans, - (flags & BCH_WATERMARK_MASK)| - JOURNAL_RES_GET_CHECK)); - break; - case -BCH_ERR_btree_insert_need_journal_reclaim: - bch2_trans_unlock(trans); - - trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); - - wait_event_freezable(c->journal.reclaim_wait, - (ret = journal_reclaim_wait_done(c))); - if (ret < 0) - break; - - ret = bch2_trans_relock(trans); - break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer *wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - 
ret = bch2_trans_relock(trans); - } - } - break; - } - default: - BUG_ON(ret >= 0); - break; - } - - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); - - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(flags & BTREE_INSERT_NOWAIT) && - (flags & BTREE_INSERT_NOFAIL), c, - "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); - - return ret; -} - -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || - test_bit(BCH_FS_STARTED, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - -/* - * This is for updates done in the early part of fsck - btree_gc - before we've - * gone RW. we only add the new key to the list of keys for journal replay to - * do. - */ -static noinline int -do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - int ret = 0; - - trans_for_each_update(trans, i) { - ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); - if (ret) - break; - } - - return ret; -} - -int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; - unsigned u64s; - int ret = 0; - - if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) - goto out_reset; - - if (flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&c->gc_lock); - - ret = bch2_trans_commit_run_triggers(trans); - if (ret) - goto out_reset; - - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - - if (!(flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; - } - - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = __bch2_btree_write_buffer_flush(trans, - flags|BTREE_INSERT_NOCHECK_RW, true); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); - - trans_for_each_update(trans, i) { - EBUG_ON(!i->path->should_be_locked); - - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); - if (unlikely(ret)) - goto out; - - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); - - if (i->key_cache_already_flushed) - continue; - - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - - if (i->flags & BTREE_UPDATE_NOJOURNAL) - continue; - - trans->journal_u64s += 
u64s; - - /* and we're also going to log the overwrite: */ - if (trans->journal_transaction_names) - trans->journal_u64s += jset_u64s(i->old_k.u64s); - } - - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { - ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, - (flags & BTREE_INSERT_NOFAIL) - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (ret) - goto err; - } -retry: - bch2_trans_verify_not_in_restart(trans); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - - ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); - - /* make sure we didn't drop or screw up locks: */ - bch2_trans_verify_locks(trans); - - if (ret) - goto err; - - trace_and_count(c, transaction_commit, trans, _RET_IP_); -out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); -out_reset: - bch2_trans_reset_updates(trans); - - return ret; -err: - ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); - if (ret) - goto out; - - goto retry; -} - -static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) - return 0; - - return __check_pos_snapshot_overwritten(trans, id, pos); -} - -static noinline int extent_front_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct bkey_i **insert, - enum btree_update_flags flags) -{ - struct bch_fs *c = trans->c; - struct bkey_i *update; - int ret; - - update = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - return ret; - - if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) - return 0; - - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); - if (ret < 0) - return ret; - if (ret) - return 0; - - ret = bch2_btree_delete_at(trans, iter, flags); - if (ret) - return ret; - - *insert = update; - return 0; -} - -static noinline int extent_back_merge(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - int ret; - - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); - if (ret < 0) - return ret; - if (ret) - return 0; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); - return 0; -} - -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; 
- u32 snapshot = pos.snapshot; - int ret; - - if (!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { - if (!bkey_eq(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = { NULL }; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; - int ret; - - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; - - darray_init(&s); - - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while ((old_k = bch2_btree_iter_prev(&old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) - continue; - - new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bkey_err(new_k); - if (ret) - break; - - if (new_k.k->type == KEY_TYPE_deleted) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; - - bkey_init(&update->k); - update->k.p = whiteout_pos; - update->k.type = KEY_TYPE_whiteout; - - ret = bch2_trans_update(trans, &new_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } - bch2_trans_iter_exit(trans, &new_iter); - - ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &new_iter); - bch2_trans_iter_exit(trans, &old_iter); - darray_exit(&s); - - return ret; -} - -int bch2_trans_update_extent_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - enum btree_update_flags flags, - struct bkey_s_c old, - struct bkey_s_c new) -{ - enum btree_id btree_id = iter->btree_id; - struct bkey_i *update; - struct bpos new_start = bkey_start_pos(new.k); - bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); - bool back_split = bkey_gt(old.k->p, new.k->p); - int ret = 0, compressed_sectors; - - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (((front_split && back_split) || - ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && - (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors; - - if (front_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_back(new_start, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - /* If we're overwriting in a different snapshot - middle split: */ - if (old.k->p.snapshot != new.k->p.snapshot && - (front_split || back_split)) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; 
- - bch2_cut_front(new_start, update); - bch2_cut_back(new.k->p, update); - - ret = bch2_insert_snapshot_whiteouts(trans, btree_id, - old.k->p, update->k.p) ?: - bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - if (bkey_le(old.k->p, new.k->p)) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bkey_init(&update->k); - update->k.p = old.k->p; - update->k.p.snapshot = new.k->p.snapshot; - - if (new.k->p.snapshot != old.k->p.snapshot) { - update->k.type = KEY_TYPE_whiteout; - } else if (btree_type_has_snapshots(btree_id)) { - ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); - if (ret < 0) - return ret; - if (ret) - update->k.type = KEY_TYPE_whiteout; - } - - ret = bch2_btree_insert_nonextent(trans, btree_id, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); - if (ret) - return ret; - } - - if (back_split) { - update = bch2_bkey_make_mut_noupdate(trans, old); - if ((ret = PTR_ERR_OR_ZERO(update))) - return ret; - - bch2_cut_front(new.k->p, update); - - ret = bch2_trans_update_by_path(trans, iter->path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags, _RET_IP_); - if (ret) - return ret; - } - - return 0; -} - -static int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_update_flags flags) -{ - struct btree_iter iter; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - - if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - ret = extent_front_merge(trans, &iter, k, &insert, flags); - if (ret) - goto err; - } - - goto next; - } - - while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { - bool done = bkey_lt(insert->k.p, k.k->p); - - ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); - if (ret) - goto err; - - if (done) - goto out; -next: - bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; - } - - if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = extent_back_merge(trans, &iter, insert, k); - if (ret) - goto err; - } -out: - if (!bkey_deleted(&insert->k)) - ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -err: - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static noinline int flush_new_cached_update(struct btree_trans *trans, - struct btree_path *path, - struct btree_insert_entry *i, - enum btree_update_flags flags, - unsigned long ip) -{ - struct btree_path *btree_path; - struct bkey k; - int ret; - - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); - if (ret) - goto out; - - /* - * The old key in the insert entry might actually refer to an existing - * key in the btree that has been deleted from cache and not yet - * flushed. Check for this and skip the flush so we don't run triggers - * against a stale key. 
- */ - bch2_btree_path_peek_slot_exact(btree_path, &k); - if (!bkey_deleted(&k)) - goto out; - - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; - - btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); -out: - bch2_path_put(trans, btree_path, true); - return ret; -} - -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i, n; - u64 seq = 0; - int cmp; - - EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - EBUG_ON(!bpos_eq(k->k.p, path->pos)); - - /* - * The transaction journal res hasn't been allocated at this point. - * That occurs at commit time. Reuse the seq field to pass in the seq - * of a prejournaled key. - */ - if (flags & BTREE_UPDATE_PREJOURNAL) - seq = trans->journal_res.seq; - - n = (struct btree_insert_entry) { - .flags = flags, - .bkey_type = __btree_node_type(path->level, path->btree_id), - .btree_id = path->btree_id, - .level = path->level, - .cached = path->cached, - .path = path, - .k = k, - .seq = seq, - .ip_allocated = ip, - }; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif - - /* - * Pending updates are kept sorted: first, find position of new update, - * then delete/trim any updates the new update overwrites: - */ - trans_for_each_update(trans, i) { - cmp = btree_insert_entry_cmp(&n, i); - if (cmp <= 0) - break; - } - - if (!cmp && i < trans->updates + trans->nr_updates) { - EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - - bch2_path_put(trans, i->path, true); - i->flags = n.flags; - i->cached = n.cached; - i->k = n.k; - i->path = n.path; - i->seq = n.seq; - i->ip_allocated = n.ip_allocated; - } else { - array_insert_item(trans->updates, trans->nr_updates, - i - trans->updates, n); - - i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; - i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; - - if (unlikely(trans->journal_replay_not_finished)) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } - } - - __btree_path_get(i->path, true); - - /* - * If a key is present in the key cache, it must also exist in the - * btree - this is necessary for cache coherency. 
When iterating over - * a btree that's cached in the key cache, the btree iter code checks - * the key cache - but the key has to exist in the btree for that to - * work: - */ - if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); - - return 0; -} - -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) -{ - struct btree_path *path = iter->update_path ?: iter->path; - struct bkey_cached *ck; - int ret; - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - return bch2_trans_update_extent(trans, iter, k, flags); - - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - - if (ret) - k->k.type = KEY_TYPE_whiteout; - } - - /* - * Ensure that updates to cached btrees go to the key cache: - */ - if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - !path->cached && - !path->level && - btree_id_cached(trans->c, path->btree_id)) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, k->k.p)) { - if (!iter->key_cache_path) - iter->key_cache_path = - bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT| - BTREE_ITER_CACHED, _THIS_IP_); - - iter->key_cache_path = - bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, - iter->flags & BTREE_ITER_INTENT, - _THIS_IP_); - - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); - if (unlikely(ret)) - return ret; - - ck = (void *) iter->key_cache_path->l[0].b; - - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); - } - - btree_path_set_should_be_locked(iter->key_cache_path); - } - - path = iter->key_cache_path; - } - - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); -} - -/* - * Add a transaction update for a key that has already been journaled. 
- */ -int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, - struct btree_iter *iter, struct bkey_i *k, - enum btree_update_flags flags) -{ - trans->journal_res.seq = seq; - return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| - BTREE_UPDATE_PREJOURNAL); -} - -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) -{ - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } - } - - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; - - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } - - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; - } - - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; - - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; - - return 0; -} - -int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, - enum btree_id btree, struct bpos end) -{ - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); - if (ret) - goto err; - - bch2_btree_iter_advance(iter); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - BUG_ON(k.k->type != KEY_TYPE_deleted); - - if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; - goto err; - } - - return 0; -err: - bch2_trans_iter_exit(trans, iter); - return ret; -} - -void bch2_trans_commit_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - h->next = trans->hooks; - trans->hooks = h; -} - -int bch2_btree_insert_nonextent(struct btree_trans *trans, - enum btree_id btree, struct bkey_i *k, - enum btree_update_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, btree, k->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, - struct bkey_i *k, enum btree_update_flags flags) -{ - struct btree_iter iter; - int ret; - - bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, flags); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/** - * bch2_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct 
bkey_i *k, - struct disk_reservation *disk_res, - u64 *journal_seq, int flags) -{ - return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, k, 0)); -} - -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = iter->pos; - bch2_key_resize(&k->k, len); - return bch2_trans_update(trans, iter, k, update_flags); -} - -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -} - -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - -int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - u32 restart_count = trans->restart_count; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { - struct disk_reservation disk_res = - bch2_disk_reservation_init(trans->c, 0); - struct bkey_i delete; - - ret = bkey_err(k); - if (ret) - goto err; - - bkey_init(&delete.k); - - /* - * This could probably be more efficient for extents: - */ - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). 
- */ - delete.k.p = iter.pos; - - if (iter.flags & BTREE_ITER_IS_EXTENTS) - bch2_key_resize(&delete.k, - bpos_min(end, k.k->p).offset - - iter.pos.offset); - - ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: - bch2_trans_commit(trans, &disk_res, journal_seq, - BTREE_INSERT_NOFAIL); - bch2_disk_reservation_put(trans->c, &disk_res); -err: - /* - * the bch2_trans_begin() call is in a weird place because we - * need to call it after every transaction commit, to avoid path - * overflow, but don't want to call it if the delete operation - * is a no-op and we have no work to do: - */ - bch2_trans_begin(trans); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - ret = 0; - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && trans_was_restarted(trans, restart_count)) - ret = -BCH_ERR_transaction_restart_nested; - return ret; -} - -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, struct bpos end, - unsigned update_flags, - u64 *journal_seq) -{ - int ret = bch2_trans_run(c, - bch2_btree_delete_range_trans(&trans, id, start, end, - update_flags, journal_seq)); - if (ret == -BCH_ERR_transaction_restart_nested) - ret = 0; - return ret; -} - -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) -{ - struct bkey_i *k; - int ret = 0; - - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; - - return bch2_trans_update_buffered(trans, btree, k); -} - -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) -{ - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); - if (ret) - goto err; - - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; -} - -static int -__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, - va_list args) -{ - int ret; - - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); - } else { - ret = bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_LAZY_RW|commit_flags, - __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); - } - - return ret; -} - -int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -{ - va_list args; - int ret; - - va_start(args, fmt); - ret = __bch2_fs_log_msg(c, 0, fmt, args); - va_end(args); - return ret; -} - -/* - * Use for logging messages during recovery to enable reserved space and avoid - * blocking. - */ -int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-{
-	va_list args;
-	int ret;
-
-	va_start(args, fmt);
-	ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
-	va_end(args);
-	return ret;
-}
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index a418f664..f192809f 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -10,7 +10,31 @@
 
 #include "buckets_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+	return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+	return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+	u32 remainder;
+
+	div_u64_rem(s, ca->mi.bucket_size, &remainder);
+	return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+						 u32 *offset)
+{
+	return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
 
 #define for_each_bucket(_b, _buckets) \
 	for (_b = (_buckets)->b + (_buckets)->first_bucket; \
@@ -292,6 +316,27 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
 			size_t, enum bch_data_type, unsigned);
 int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
 
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	u64 b_offset = bucket_to_sector(ca, b);
+	u64 b_end = bucket_to_sector(ca, b + 1);
+	unsigned i;
+
+	if (!b)
+		return true;
+
+	for (i = 0; i < layout->nr_superblocks; i++) {
+		u64 offset = le64_to_cpu(layout->sb_offset[i]);
+		u64 end = offset + (1 << layout->sb_max_size_bits);
+
+		if (!(offset >= b_end || end <= b_offset))
+			return true;
+	}
+
+	return false;
+}
+
 /* disk reservations: */
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index a08997a5..a0ef85eb 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -360,7 +360,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
 
 	state.type = type;
 	bch2_checksum_init(&state);
-	state.seed = a.lo;
+	state.seed = (u64 __force) a.lo;
 
 	BUG_ON(!bch2_checksum_mergeable(type));
 
@@ -371,7 +371,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
 				page_address(ZERO_PAGE(0)), b);
 		b_len -= b;
 	}
-	a.lo = bch2_checksum_final(&state);
+	a.lo = (__le64 __force) bch2_checksum_final(&state);
 	a.lo ^= b.lo;
 	a.hi ^= b.hi;
 	return a;
@@ -426,7 +426,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 	merged = bch2_checksum_bio(c, crc_old.csum_type,
 				extent_nonce(version, crc_old), bio);
 
-	if (bch2_crc_cmp(merged, crc_old.csum)) {
+	if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
 		bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
 			"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
 			crc_old.csum.hi,
@@ -458,6 +458,48 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 	return 0;
 }
 
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+				  struct bch_sb_field *f,
+				  struct printbuf *err)
+{
+	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+	if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+		prt_printf(err, "wrong size (got %zu should be %zu)",
+			   vstruct_bytes(&crypt->field), sizeof(*crypt));
+		return -BCH_ERR_invalid_sb_crypt;
+	}
+
+	if (BCH_CRYPT_KDF_TYPE(crypt)) {
+		prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+		return -BCH_ERR_invalid_sb_crypt;
+	}
+
+	return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+				  struct bch_sb_field *f)
+{
+	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+	prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt));
+	prt_newline(out);
+	prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt));
+	prt_newline(out);
+	prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt));
+	prt_newline(out);
+	prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt));
+	prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+	.validate = bch2_sb_crypt_validate,
+	.to_text = bch2_sb_crypt_to_text,
+};
+
 #ifdef __KERNEL__
 static int __bch2_request_key(char *key_description, struct bch_key *key)
 {
@@ -597,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
 	if (ret)
 		goto out;
 
-	crypt->key.magic = BCH_KEY_MAGIC;
+	crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
 	crypt->key.key = key;
 
 	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
@@ -625,7 +667,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
 	if (ret)
 		goto err;
 
-	key.magic = BCH_KEY_MAGIC;
+	key.magic = cpu_to_le64(BCH_KEY_MAGIC);
 	get_random_bytes(&key.key, sizeof(key.key));
 
 	if (keyed) {
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index 1ad1d5f0..c7b1a8fc 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -72,6 +72,8 @@ static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
 		: 0;
 }
 
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
 int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
 			struct bch_key *);
 
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
index de14ca3a..f36472c4 100644
--- a/libbcachefs/disk_groups.c
+++ b/libbcachefs/disk_groups.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "disk_groups.h"
+#include "sb-members.h"
 #include "super-io.h"
 
 #include
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 735eb241..f7fa8744 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -213,6 +213,12 @@
 	x(BCH_ERR_invalid_sb, invalid_sb_quota) \
 	x(BCH_ERR_invalid, invalid_bkey) \
 	x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
+	x(EIO, btree_node_read_err) \
+	x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
+	x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
+	x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
+	x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \
+	x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible)
 
 enum bch_errcode {
 	BCH_ERR_START = 2048,
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index c13e0afc..7a3f42f3 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -517,13 +517,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
 	switch (type) {
 	case BCH_EXTENT_ENTRY_crc32:
 		set_common_fields(dst->crc32, src);
-		memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum));
+		dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
 		break;
 	case BCH_EXTENT_ENTRY_crc64:
 		set_common_fields(dst->crc64, src);
 		dst->crc64.nonce = src.nonce;
-		dst->crc64.csum_lo = src.csum.lo;
-		dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
+		dst->crc64.csum_lo = (u64 __force) src.csum.lo;
+		dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
 		break;
 	case BCH_EXTENT_ENTRY_crc128:
 		set_common_fields(dst->crc128, src);
diff --git a/libbcachefs/extents.h
b/libbcachefs/extents.h index 6e9d23a0..7ee8d031 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -155,7 +155,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) common_fields(crc->crc32), }; - memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); + *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; return ret; } case BCH_EXTENT_ENTRY_crc64: { @@ -165,8 +165,8 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .csum.lo = (__force __le64) crc->crc64.csum_lo, }; - u16 hi = crc->crc64.csum_hi; - memcpy(&ret.csum.hi, &hi, sizeof(hi)); + *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; + return ret; } case BCH_EXTENT_ENTRY_crc128: { diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c new file mode 100644 index 00000000..f6c8d216 --- /dev/null +++ b/libbcachefs/fs-io-buffered.c @@ -0,0 +1,1102 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io.h" + +#include +#include +#include + +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + if (!bio->bi_status) { + folio_mark_uptodate(fi.folio); + } else { + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); + } + folio_unlock(fi.folio); + } + + bio_put(bio); +} + +struct readpages_iter { + struct address_space *mapping; + unsigned idx; + folios folios; +}; + +static int readpages_iter_init(struct readpages_iter *iter, + struct readahead_control *ractl) +{ + struct folio **fi; + int ret; + + memset(iter, 0, sizeof(*iter)); + + iter->mapping = ractl->mapping; + + ret = bch2_filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; + + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); + folio_put(*fi); + folio_put(*fi); + } + + return 0; +} + +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) +{ + if (iter->idx >= iter->folios.nr) + return NULL; + return iter->folios.data[iter->idx]; +} + +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; +} + +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + +static int readpage_bio_extend(struct btree_trans *trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) +{ + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + + while (bio_sectors(bio) < sectors_this_extent && + bio->bi_vcnt < bio->bi_max_vecs) { + struct folio *folio = readpage_iter_peek(iter); + int ret; + + if (folio) { + readpage_iter_advance(iter); + } else { + pgoff_t folio_offset = 
bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + + if (!get_more) + break; + + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) + break; + + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) + break; + + if (!__bch2_folio_create(folio, GFP_KERNEL)) { + folio_put(folio); + break; + } + + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); + if (ret) { + __bch2_folio_release(folio); + folio_put(folio); + break; + } + + folio_put(folio); + } + + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + } + + return bch2_trans_relock(trans); +} + +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, + struct readpages_iter *readpages_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + u32 snapshot; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + sectors = min(sectors, k.k->size - offset_into_extent); + + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } + + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + iter.pos.inode, + iter.pos.offset << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } + + bch2_bkey_buf_exit(&sk, c); +} + +void bch2_readahead(struct readahead_control *ractl) +{ + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct btree_trans trans; + struct folio *folio; + struct 
readpages_iter readpages_iter; + int ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); + + bch2_pagecache_add_get(inode); + + while ((folio = readpage_iter_peek(&readpages_iter))) { + unsigned n = min_t(unsigned, + readpages_iter.folios.nr - + readpages_iter.idx, + BIO_MAX_VECS); + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), + opts); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(&trans, rbio, inode_inum(inode), + &readpages_iter); + bch2_trans_unlock(&trans); + } + + bch2_pagecache_add_put(inode); + + bch2_trans_exit(&trans); + darray_exit(&readpages_iter.folios); +} + +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) +{ + struct btree_trans trans; + + bch2_folio_create(folio, __GFP_NOFAIL); + + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_init(&trans, c, 0, 0); + bchfs_read(&trans, rbio, inum, NULL); + bch2_trans_exit(&trans); +} + +static void bch2_read_single_folio_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + struct bch_io_opts opts; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), + opts); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; + + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + folio_mark_uptodate(folio); + return 0; +} + +int bch2_read_folio(struct file *file, struct folio *folio) +{ + int ret; + + ret = bch2_read_single_folio(folio, folio->mapping); + folio_unlock(folio); + return bch2_err_class(ret); +} + +/* writepages: */ + +struct bch_writepage_io { + struct bch_inode_info *inode; + + /* must be last: */ + struct bch_write_op op; +}; + +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; +}; + +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; +} + +static void bch2_writepage_io_done(struct bch_write_op *op) +{ + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct folio_iter fi; + unsigned i; + + if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + 
s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ + WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up + * slightly) + * XXX wtf? + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); + */ + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(fi.folio); + } + + bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + u64 sector, + unsigned nr_replicas) +{ + struct bch_write_op *op; + + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_KERNEL, + &c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + + w->io->inode = inode; + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} + +static int __bch2_writepage(struct folio *folio, + struct writeback_control *wbc, + void *data) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_writepage_state *w = data; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; + loff_t i_size = i_size_read(&inode->v); + int ret; + + EBUG_ON(!folio_test_uptodate(folio)); + + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) + goto do_io; + + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); + return 0; + } + + /* + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); +do_io: + f_sectors = folio_sectors(folio); + s = bch2_folio(folio); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kzalloc(sizeof(struct bch_folio_sector) * + f_sectors, __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); + BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + s->s[i].nr_replicas = w->opts.compression + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; + bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + + folio_unlock(folio); + + offset = 0; + while (1) { + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) + offset++; + + if (offset == f_sectors) + break; + + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; + sectors++; + } + BUG_ON(!sectors); + + sector = folio_sector(folio) + offset; + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio, sectors << 9) || + w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= + (BIO_MAX_VECS * PAGE_SIZE) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, wbc, w, inode, sector, + nr_replicas_this_write); + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + "writing past i_size: %llu > %llu (unrounded %llu)\n", + bio_end_sector(&w->io->op.wbio.bio) << 9, + round_up(i_size, block_bytes(c)), + i_size); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(folio); + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); + if (w.io) + bch2_writepage_do_io(&w); + blk_finish_plug(&plug); + kfree(w.tmp); + return bch2_err_class(ret); +} + +/* buffered writes: */ + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + 
struct page **pagep, void **fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; + int ret = -ENOMEM; + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_folio_reservation_init(c, inode, res); + *fsdata = res; + + bch2_pagecache_add_get(inode); + + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (IS_ERR_OR_NULL(folio)) + goto err_unlock; + + if (folio_test_uptodate(folio)) + goto out; + + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) + goto out; + + if (!offset && pos + len >= inode->v.i_size) { + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } + + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } +readpage: + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto err; +out: + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; + + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); + if (ret) { + if (!folio_test_uptodate(folio)) { + /* + * If the folio hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the folio is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = &folio->page; + return 0; +err: + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; +err_unlock: + bch2_pagecache_add_put(inode); + kfree(res); + *fsdata = NULL; + return bch2_err_class(ret); +} + +int bch2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); + + lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); + + if (unlikely(copied < len && !folio_test_uptodate(folio))) { + /* + * The folio needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + copied = 0; + } + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied) { + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); + + inode->ei_last_dirtied = (unsigned long) current; + } + + folio_unlock(folio); + folio_put(folio); + bch2_pagecache_add_put(inode); + + bch2_folio_reservation_put(c, inode, res); + kfree(res); + + return copied; +} + +static noinline void folios_trunc(folios *folios, struct folio **fi) +{ + while (folios->data + folios->nr > fi) { + struct folio *f = darray_pop(folios); + + folio_unlock(f); + folio_put(f); + } +} + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct 
address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + folios folios; + struct folio **fi, *f; + unsigned copied = 0, f_offset; + u64 end = pos + len, f_pos; + loff_t last_folio_pos = inode->v.i_size; + int ret = 0; + + BUG_ON(!len); + + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); + + ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); + if (ret) + goto out; + + BUG_ON(!folios.nr); + + f = darray_first(folios); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + f = darray_last(folios); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + } else { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + } + + ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + if (ret) + goto out; + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + + /* + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're + * supposed to write as much as we have disk space for. + * + * On failure here we should still write out a partial page if + * we aren't completely out of disk space - we don't do that + * yet: + */ + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); + if (unlikely(ret)) { + folios_trunc(&folios, fi); + if (!folios.nr) + goto out; + + end = min(end, folio_end_pos(darray_last(folios))); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (mapping_writably_mapped(mapping)) + darray_for_each(folios, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + + if (!f_copied) { + folios_trunc(&folios, fi); + break; + } + + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); + break; + } + + flush_dcache_folio(f); + copied += f_copied; + + if (f_copied != f_len) { + folios_trunc(&folios, fi + 1); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (!copied) + goto out; + + end = pos + copied; + + spin_lock(&inode->v.i_lock); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); + + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + inode->ei_last_dirtied = (unsigned long) current; +out: + darray_for_each(folios, fi) { + folio_unlock(*fi); + folio_put(*fi); + } + + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. 
+ */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&folios); + bch2_folio_reservation_put(c, inode, &res); + + return copied ?: ret; +} + +static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + bch2_pagecache_add_get(inode); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = iov_iter_count(iter); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + ret = 0; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + bch2_pagecache_add_put(inode); + + return written ? 
written : ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(&inode->v); + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + current->backing_dev_info = NULL; + + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: + return bch2_err_class(ret); +} + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) +{ + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fs_io_buffered_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h new file mode 100644 index 00000000..74eba219 --- /dev/null +++ b/libbcachefs/fs-io-buffered.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_BUFFERED_H +#define _BCACHEFS_FS_IO_BUFFERED_H + +#ifndef NO_BCACHEFS_FS + +int bch2_read_single_folio(struct folio *, struct address_space *); +int bch2_read_folio(struct file *, struct folio *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +void bch2_readahead(struct readahead_control *); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *); +int bch2_fs_fs_io_buffered_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c new file mode 100644 index 00000000..a5cadc5d --- /dev/null +++ b/libbcachefs/fs-io-direct.c @@ -0,0 +1,678 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io.h" + +#include +#include +#include + +/* O_DIRECT reads */ + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + bool should_dirty; + struct bch_read_bio rbio; +}; + +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static void bch2_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + 
struct dio_read *dio = bio->bi_private; + + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); + + closure_put(&dio->cl); +} + +static void bch2_direct_IO_read_split_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + bch2_direct_IO_read_endio(bio); + bio_check_or_release(bio, should_dirty); +} + +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct dio_read *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); + size_t shorten; + ssize_t ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); + + if (!ret) + return ret; + + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + iter->count -= shorten; + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + } + + dio->req = req; + dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. 
+ */ + dio->should_dirty = iter_is_iovec(iter); + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + + if (dio->should_dirty) + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + } + + iter->count += shorten; + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + return ret; + } else { + return -EIOCBQUEUED; + } +} + +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(inode); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(inode); + } +out: + return bch2_err_class(ret); +} + +/* O_DIRECT writes */ + +struct dio_write { + struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + extending:1, + sync:1, + flush:1, + free_iov:1; + struct quota_res quota_res; + u64 written; + + struct iov_iter iter; + struct iovec inline_vecs[2]; + + /* must be last: */ + struct bch_write_op op; +}; + +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + + return err ? 
false : ret; +} + +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.__iov = iov; + return 0; +} + +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) { + dio->op.error = ret; + } else { + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + + if (dio->free_iov) + kfree(dio->iter.__iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + if (dio->extending) { + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + 
i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } + + bio_release_pages(bio, false); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; + struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + while (1) { + iter_count = dio->iter.count; + + EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; + + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + + if (unlikely(ret < 0)) + goto err; + + if (unlikely(dropped_locks)) { + ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if (!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + ret = -EFAULT; + goto err; + } + + bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = sync + ? 
NULL + : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); + if (unlikely(ret)) + goto err; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; + + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + + if (!sync) + return -EIOCBQUEUED; + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); + } +out: + return bch2_dio_write_done(dio); +err: + dio->op.error = ret; + + bio_release_pages(bio, false); + + bch2_quota_reservation_put(c, inode, &dio->quota_res); + goto out; +} + +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +{ + struct mm_struct *mm = dio->mm; + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct dio_write *dio; + struct bio *bio; + bool locked = true, extending; + ssize_t ret; + + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); + + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; + + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(inode); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + dio->req = req; + dio->mapping = mapping; + dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->extending = extending; + dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; + dio->free_iov = false; + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { + ret = 
bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } + + ret = bch2_dio_write_loop(dio); +err: + if (locked) + inode_unlock(&inode->v); + return ret; +err_put_bio: + bch2_pagecache_block_put(inode); + bio_put(bio); + inode_dio_end(&inode->v); + goto err; +} + +void bch2_fs_fs_io_direct_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); +} + +int bch2_fs_fs_io_direct_init(struct bch_fs *c) +{ + if (bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io-direct.h b/libbcachefs/fs-io-direct.h new file mode 100644 index 00000000..8e950cca --- /dev/null +++ b/libbcachefs/fs-io-direct.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_DIRECT_H +#define _BCACHEFS_FS_IO_DIRECT_H + +#ifndef NO_BCACHEFS_FS +ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_direct_exit(struct bch_fs *); +int bch2_fs_fs_io_direct_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c new file mode 100644 index 00000000..07c4bfe3 --- /dev/null +++ b/libbcachefs/fs-io-pagecache.c @@ -0,0 +1,777 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "btree_iter.h" +#include "extents.h" +#include "fs-io.h" +#include "fs-io-pagecache.h" +#include "subvolume.h" + +#include +#include + +int bch2_filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, u64 end, + int fgp_flags, gfp_t gfp, + folios *folios) +{ + struct folio *f; + u64 pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (IS_ERR_OR_NULL(f)) + break; + + BUG_ON(folios->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(folios, f); + } + + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return folios->nr ? 
0 : ret; +} + +/* pagecache_block must be held */ +int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +/* for newly allocated folios: */ +struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + struct bch_folio *s; + + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), gfp); + if (!s) + return NULL; + + spin_lock_init(&s->lock); + folio_attach_private(folio, s); + return s; +} + +struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); +} + +static unsigned bkey_to_sector_state(struct bkey_s_c k) +{ + if (bkey_extent_is_reservation(k)) + return SECTOR_reserved; + if (bkey_extent_is_allocation(k.k)) + return SECTOR_allocated; + return SECTOR_unallocated; +} + +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + bch2_folio_sector_set(folio, s, i, state); + } + + if (i == sectors) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **folios, unsigned nr_folios) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_folio *s; + u64 offset = folio_sector(folios[0]); + unsigned folio_idx; + u32 snapshot; + bool need_set = false; + int ret; + + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + 
goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + while (folio_idx < nr_folios) { + struct folio *folio = folios[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; + + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); + + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } + + if (folio_idx == nr_folios) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct folio_vec fv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); +} + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + s64 i_sectors_delta = 0; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + 
bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); + } + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) +{ + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +int bch2_get_folio_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, bool check_enospc) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned nr_replicas = inode_nr_replicas(c, inode); + struct disk_reservation disk_res = { 0 }; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < sectors; i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +void bch2_folio_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + BUG_ON(!s->uptodate); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; + + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } + + return 0; +} + +static void bch2_clear_folio_bits(struct folio *folio) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_folio *s = bch2_folio(folio); + struct disk_reservation disk_res = { 0 }; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; + + if (!s) + return; + + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); + + for (i = 0; i < sectors; i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + dirty_sectors -= s->s[i].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); + } + + bch2_disk_reservation_put(c, &disk_res); + + bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_folio_release(folio); +} + +void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio(folio); + 
unsigned i, dirty_sectors = 0; + + WARN_ON((u64) folio_pos(folio) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); + + BUG_ON(!s->uptodate); + + spin_lock(&s->lock); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; + + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); + } + + spin_unlock(&s->lock); + + bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); +} + +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + vm_fault_t ret; + + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(inode)) + goto got_lock; + + bch2_pagecache_block_put(fdm_host); + + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); + + bch2_pagecache_block_get(fdm_host); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + + bch2_pagecache_add_get(inode); +got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(inode); + + return ret; +} + +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + unsigned len; + loff_t isize; + vm_fault_t ret; + + bch2_folio_reservation_init(c, inode, &res); + + sb_start_pagefault(inode->v.i_sb); + file_update_time(file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get + * a bch2_write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + bch2_pagecache_add_get(inode); + + folio_lock(folio); + isize = i_size_read(&inode->v); + + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); + ret = VM_FAULT_SIGBUS; + goto out; + } + + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); + + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; +out: + bch2_pagecache_add_put(inode); + sb_end_pagefault(inode->v.i_sb); + + return ret; +} + +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + if (offset || length < folio_size(folio)) + return; + + bch2_clear_folio_bits(folio); +} + +bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + if 
(folio_test_dirty(folio) || folio_test_writeback(folio)) + return false; + + bch2_clear_folio_bits(folio); + return true; +} + +/* fseek: */ + +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + if (s) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; + + return -1; +} + +loff_t bch2_seek_pagecache_data(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned i; + loff_t ret; + int offset; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (!nonblock) { + folio_lock(folio); + } else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + return -EAGAIN; + } + + offset = folio_data_offset(folio, + max(folio_pos(folio), start_offset), + min_replicas); + if (offset >= 0) { + ret = clamp(folio_pos(folio) + offset, + start_offset, end_offset); + folio_unlock(folio); + folio_batch_release(&fbatch); + return ret; + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return end_offset; +} + +static int folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas, bool nonblock) +{ + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors; + bool ret = true; + + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, + FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); + if (folio == ERR_PTR(-EAGAIN)) + return -EAGAIN; + if (IS_ERR_OR_NULL(folio)) + return true; + + s = bch2_folio(folio); + if (!s) + goto unlock; + + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; + } + + *offset = folio_end_pos(folio); + ret = false; +unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +loff_t bch2_seek_pagecache_hole(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct address_space *mapping = vinode->i_mapping; + loff_t offset = start_offset; + + while (offset < end_offset && + !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) + ; + + return min(offset, end_offset); +} + +int bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas, + bool nonblock) +{ + loff_t ret; + + ret = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_start = ret; + + if (*hole_start == *hole_end) + return 0; + + ret = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_end = ret; + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h new file mode 100644 index 00000000..f1c747cb --- /dev/null +++ b/libbcachefs/fs-io-pagecache.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_PAGECACHE_H +#define _BCACHEFS_FS_IO_PAGECACHE_H + +#include + +typedef DARRAY(struct folio *) folios; + +int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, + u64, int, gfp_t, folios *); +int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); + +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. + */ +static inline u64 folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline u64 folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +struct bch_folio_sector { + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; + + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; + + /* i_sectors: */ + enum bch_folio_sector_state state:8; +}; + +struct bch_folio { + spinlock_t lock; + atomic_t write_count; + /* + * Is the sector state up to date with the btree? 
+ * (Not the data itself) + */ + bool uptodate; + struct bch_folio_sector s[]; +}; + +/* Helper for when we need to add debug instrumentation: */ +static inline void bch2_folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + +/* for newly allocated folios: */ +static inline void __bch2_folio_release(struct folio *folio) +{ + kfree(folio_detach_private(folio)); +} + +static inline void bch2_folio_release(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) +{ + return folio_has_private(folio) + ? (struct bch_folio *) folio_get_private(folio) + : NULL; +} + +static inline struct bch_folio *bch2_folio(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + + return __bch2_folio(folio); +} + +struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); +struct bch_folio *bch2_folio_create(struct folio *, gfp_t); + +struct bch2_folio_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ + return inode->ei_inode.bi_data_replicas + ? inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; +} + +static inline void bch2_folio_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); +void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); +void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); + +int bch2_get_folio_disk_reservation(struct bch_fs *, + struct bch_inode_info *, + struct folio *, bool); + +void bch2_folio_reservation_put(struct bch_fs *, + struct bch_inode_info *, + struct bch2_folio_reservation *); +int bch2_folio_reservation_get(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +void bch2_set_folio_dirty(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +vm_fault_t bch2_page_fault(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); + +loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); +loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); +int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); + +#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d433f4d5..e846d1eb 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" -#include "bkey_buf.h" +//#include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "clock.h" @@ -12,6 +12,9 @@ #include "extent_update.h" #include "fs.h" 
#include "fs-io.h" +#include "fs-io-buffered.h" +//#include "fs-io-direct.h" +#include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "journal.h" @@ -31,2746 +34,137 @@ #include #include #include -#include #include -static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); - -struct folio_vec { - struct folio *fv_folio; - size_t fv_offset; - size_t fv_len; -}; - -static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -{ - - struct folio *folio = page_folio(bv.bv_page); - size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + - bv.bv_offset; - size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); - - return (struct folio_vec) { - .fv_folio = folio, - .fv_offset = offset, - .fv_len = len, - }; -} - -static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, - struct bvec_iter iter) -{ - return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -} - -#define __bio_for_each_folio(bvl, bio, iter, start) \ - for (iter = (start); \ - (iter).bi_size && \ - ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ - bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) - -/** - * bio_for_each_folio - iterate over folios within a bio - * - * Like other non-_all versions, this iterates over what bio->bi_iter currently - * points to. This version is for drivers, where the bio may have previously - * been split or cloned. - */ -#define bio_for_each_folio(bvl, bio, iter) \ - __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) - -/* - * Use u64 for the end pos and sector helpers because if the folio covers the - * max supported range of the mapping, the start offset of the next folio - * overflows loff_t. This breaks much of the range based processing in the - * buffered write path. - */ -static inline u64 folio_end_pos(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - -static inline size_t folio_sectors(struct folio *folio) -{ - return PAGE_SECTORS << folio_order(folio); -} - -static inline loff_t folio_sector(struct folio *folio) -{ - return folio_pos(folio) >> 9; -} - -static inline u64 folio_end_sector(struct folio *folio) -{ - return folio_end_pos(folio) >> 9; -} - -typedef DARRAY(struct folio *) folios; - -static int filemap_get_contig_folios_d(struct address_space *mapping, - loff_t start, u64 end, - int fgp_flags, gfp_t gfp, - folios *folios) -{ - struct folio *f; - u64 pos = start; - int ret = 0; - - while (pos < end) { - if ((u64) pos >= (u64) start + (1ULL << 20)) - fgp_flags &= ~FGP_CREAT; - - ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); - if (ret) - break; - - f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) - break; - - BUG_ON(folios->nr && folio_pos(f) != pos); - - pos = folio_end_pos(f); - darray_push(folios, f); - } - - if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) - ret = -ENOMEM; - - return folios->nr ? 
0 : ret; -} - -struct nocow_flush { - struct closure *cl; - struct bch_dev *ca; - struct bio bio; -}; - -static void nocow_flush_endio(struct bio *_bio) -{ - - struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - - closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref); - bio_put(&bio->bio); -} - -static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, - struct bch_inode_info *inode, - struct closure *cl) -{ - struct nocow_flush *bio; - struct bch_dev *ca; - struct bch_devs_mask devs; - unsigned dev; - - dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); - if (dev == BCH_SB_MEMBERS_MAX) - return; - - devs = inode->ei_devs_need_flush; - memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - - for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref)) - ca = NULL; - rcu_read_unlock(); - - if (!ca) - continue; - - bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, - REQ_OP_FLUSH, - GFP_KERNEL, - &c->nocow_flush_bioset), - struct nocow_flush, bio); - bio->cl = cl; - bio->ca = ca; - bio->bio.bi_end_io = nocow_flush_endio; - closure_bio_submit(&bio->bio, cl); - } -} - -static int bch2_inode_flush_nocow_writes(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct closure cl; - - closure_init_stack(&cl); - bch2_inode_flush_nocow_writes_async(c, inode, &cl); - closure_sync(&cl); - - return 0; -} - -static inline bool bio_full(struct bio *bio, unsigned len) -{ - if (bio->bi_vcnt >= bio->bi_max_vecs) - return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) - return true; - return false; -} - -static inline struct address_space *faults_disabled_mapping(void) -{ - return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -} - -static inline void set_fdm_dropped_locks(void) -{ - current->faults_disabled_mapping = - (void *) (((unsigned long) current->faults_disabled_mapping)|1); -} - -static inline bool fdm_dropped_locks(void) -{ - return ((unsigned long) current->faults_disabled_mapping) & 1; -} - -struct quota_res { - u64 sectors; -}; - -struct bch_writepage_io { - struct bch_inode_info *inode; - - /* must be last: */ - struct bch_write_op op; -}; - -struct dio_write { - struct kiocb *req; - struct address_space *mapping; - struct bch_inode_info *inode; - struct mm_struct *mm; - unsigned loop:1, - extending:1, - sync:1, - flush:1, - free_iov:1; - struct quota_res quota_res; - u64 written; - - struct iov_iter iter; - struct iovec inline_vecs[2]; - - /* must be last: */ - struct bch_write_op op; -}; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - bool should_dirty; - struct bch_read_bio rbio; -}; - -/* pagecache_block must be held */ -static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, - loff_t start, loff_t end) -{ - int ret; - - /* - * XXX: the way this is currently implemented, we can spin if a process - * is continually redirtying a specific page - */ - do { - if (!mapping->nrpages) - return 0; - - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - break; - - if (!mapping->nrpages) - return 0; - - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } while (ret == -EBUSY); - - return ret; -} - -/* quotas */ - -#ifdef CONFIG_BCACHEFS_QUOTA - -static void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - BUG_ON(res->sectors 
> inode->ei_quota_reserved); - - bch2_quota_acct(c, inode->ei_qid, Q_SPC, - -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); - inode->ei_quota_reserved -= res->sectors; - res->sectors = 0; -} - -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) -{ - if (res->sectors) { - mutex_lock(&inode->ei_quota_lock); - __bch2_quota_reservation_put(c, inode, res); - mutex_unlock(&inode->ei_quota_lock); - } -} - -static int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - u64 sectors, - bool check_enospc) -{ - int ret; - - if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) - return 0; - - mutex_lock(&inode->ei_quota_lock); - ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, - check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); - if (likely(!ret)) { - inode->ei_quota_reserved += sectors; - res->sectors += sectors; - } - mutex_unlock(&inode->ei_quota_lock); - - return ret; -} - -#else - -static void __bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static void bch2_quota_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res) {} - -static int bch2_quota_reservation_add(struct bch_fs *c, - struct bch_inode_info *inode, - struct quota_res *res, - unsigned sectors, - bool check_enospc) -{ - return 0; -} - -#endif - -/* i_size updates: */ - -struct inode_new_size { - loff_t new_size; - u64 now; - unsigned fields; -}; - -static int inode_set_size(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct inode_new_size *s = p; - - bi->bi_size = s->new_size; - if (s->fields & ATTR_ATIME) - bi->bi_atime = s->now; - if (s->fields & ATTR_MTIME) - bi->bi_mtime = s->now; - if (s->fields & ATTR_CTIME) - bi->bi_ctime = s->now; - - return 0; -} - -int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) -{ - struct inode_new_size s = { - .new_size = new_size, - .now = bch2_current_time(c), - .fields = fields, - }; - - return bch2_write_inode(c, inode, inode_set_size, &s, fields); -} - -static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, - "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); - inode->v.i_blocks += sectors; - -#ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && - !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && - sectors > 0) { - BUG_ON(sectors > quota_res->sectors); - BUG_ON(sectors > inode->ei_quota_reserved); - - quota_res->sectors -= sectors; - inode->ei_quota_reserved -= sectors; - } else { - bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); - } -#endif -} - -static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, s64 sectors) -{ - if (sectors) { - mutex_lock(&inode->ei_quota_lock); - __i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_quota_lock); - } -} - -/* page state: */ - -/* stored in page->private: */ - -#define BCH_FOLIO_SECTOR_STATE() \ - x(unallocated) \ - x(reserved) \ - x(dirty) \ - x(dirty_reserved) \ - x(allocated) - -enum bch_folio_sector_state { -#define x(n) SECTOR_##n, - BCH_FOLIO_SECTOR_STATE() -#undef x -}; - -static const char * const 
bch2_folio_sector_states[] = { -#define x(n) #n, - BCH_FOLIO_SECTOR_STATE() -#undef x - NULL -}; - -static inline enum bch_folio_sector_state -folio_sector_dirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_dirty; - case SECTOR_reserved: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_undirty(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_dirty: - return SECTOR_unallocated; - case SECTOR_dirty_reserved: - return SECTOR_reserved; - default: - return state; - } -} - -static inline enum bch_folio_sector_state -folio_sector_reserve(enum bch_folio_sector_state state) -{ - switch (state) { - case SECTOR_unallocated: - return SECTOR_reserved; - case SECTOR_dirty: - return SECTOR_dirty_reserved; - default: - return state; - } -} - -struct bch_folio_sector { - /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - - /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; -}; - -struct bch_folio { - spinlock_t lock; - atomic_t write_count; - /* - * Is the sector state up to date with the btree? - * (Not the data itself) - */ - bool uptodate; - struct bch_folio_sector s[]; -}; - -static inline void folio_sector_set(struct folio *folio, - struct bch_folio *s, - unsigned i, unsigned n) -{ - s->s[i].state = n; -} - -/* file offset (to folio offset) to bch_folio_sector index */ -static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -{ - u64 f_offset = pos - folio_pos(folio); - BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); - return f_offset >> SECTOR_SHIFT; -} - -static inline struct bch_folio *__bch2_folio(struct folio *folio) -{ - return folio_has_private(folio) - ? 
(struct bch_folio *) folio_get_private(folio) - : NULL; -} - -static inline struct bch_folio *bch2_folio(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - - return __bch2_folio(folio); -} - -/* for newly allocated folios: */ -static void __bch2_folio_release(struct folio *folio) -{ - kfree(folio_detach_private(folio)); -} - -static void bch2_folio_release(struct folio *folio) -{ - EBUG_ON(!folio_test_locked(folio)); - __bch2_folio_release(folio); -} - -/* for newly allocated folios: */ -static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - struct bch_folio *s; - - s = kzalloc(sizeof(*s) + - sizeof(struct bch_folio_sector) * - folio_sectors(folio), gfp); - if (!s) - return NULL; - - spin_lock_init(&s->lock); - folio_attach_private(folio, s); - return s; -} - -static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -{ - return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -} - -static unsigned bkey_to_sector_state(struct bkey_s_c k) -{ - if (bkey_extent_is_reservation(k)) - return SECTOR_reserved; - if (bkey_extent_is_allocation(k.k)) - return SECTOR_allocated; - return SECTOR_unallocated; -} - -static void __bch2_folio_set(struct folio *folio, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - BUG_ON(pg_offset >= sectors); - BUG_ON(pg_offset + pg_len > sectors); - - spin_lock(&s->lock); - - for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - folio_sector_set(folio, s, i, state); - } - - if (i == sectors) - s->uptodate = true; - - spin_unlock(&s->lock); -} - -/* - * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the - * extents btree: - */ -static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, - struct folio **folios, unsigned nr_folios) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; - u64 offset = folio_sector(folios[0]); - unsigned folio_idx; - u32 snapshot; - bool need_set = false; - int ret; - - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); - if (!s) - return -ENOMEM; - - need_set |= !s->uptodate; - } - - if (!need_set) - return 0; - - folio_idx = 0; - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - while (folio_idx < nr_folios) { - struct folio *folio = folios[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; - - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); - - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - - if (k.k->p.offset < folio_end) - break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - 
bch2_trans_exit(&trans); - - return ret; -} - -static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct folio_vec fv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); - - bio_for_each_folio(fv, bio, iter) - __bch2_folio_set(fv.fv_folio, - fv.fv_offset >> 9, - fv.fv_len >> 9, - nr_ptrs, state); -} - -static void mark_pagecache_unallocated(struct bch_inode_info *inode, - u64 start, u64 end) -{ - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } -} - -static void mark_pagecache_reserved(struct bch_inode_info *inode, - u64 start, u64 end) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - pgoff_t index = start >> PAGE_SECTORS_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; - struct folio_batch fbatch; - s64 i_sectors_delta = 0; - unsigned i, j; - - if (end <= start) - return; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(inode->v.i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(start, folio_start) - folio_start; - unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; - struct bch_folio *s; - - BUG_ON(end <= folio_start); - - folio_lock(folio); - s = bch2_folio(folio); - - if (s) { - spin_lock(&s->lock); - for (j = folio_offset; j < folio_offset + folio_len; j++) { - i_sectors_delta -= s->s[j].state == SECTOR_dirty; - folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); - } - spin_unlock(&s->lock); - } - - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - i_sectors_acct(c, inode, NULL, i_sectors_delta); -} - -static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -{ - /* XXX: this should not be open coded */ - return inode->ei_inode.bi_data_replicas - ? 
inode->ei_inode.bi_data_replicas - 1 - : c->opts.data_replicas; -} - -static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, - unsigned nr_replicas) -{ - return max(0, (int) nr_replicas - - s->nr_replicas - - s->replicas_reserved); -} - -static int bch2_get_folio_disk_reservation(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, bool check_enospc) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned nr_replicas = inode_nr_replicas(c, inode); - struct disk_reservation disk_res = { 0 }; - unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - for (i = 0; i < sectors; i++) - disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); - - if (!disk_res_sectors) - return 0; - - ret = bch2_disk_reservation_get(c, &disk_res, - disk_res_sectors, 1, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL - : 0); - if (unlikely(ret)) - return ret; - - for (i = 0; i < sectors; i++) - s->s[i].replicas_reserved += - sectors_to_reserve(&s->s[i], nr_replicas); - - return 0; -} - -struct bch2_folio_reservation { - struct disk_reservation disk; - struct quota_res quota; -}; - -static void bch2_folio_reservation_init(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - memset(res, 0, sizeof(*res)); - - res->disk.nr_replicas = inode_nr_replicas(c, inode); -} - -static void bch2_folio_reservation_put(struct bch_fs *c, - struct bch_inode_info *inode, - struct bch2_folio_reservation *res) -{ - bch2_disk_reservation_put(c, &res->disk); - bch2_quota_reservation_put(c, inode, &res->quota); -} - -static int bch2_folio_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio_create(folio, 0); - unsigned i, disk_sectors = 0, quota_sectors = 0; - int ret; - - if (!s) - return -ENOMEM; - - BUG_ON(!s->uptodate); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - disk_sectors += sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_unallocated; - } - - if (disk_sectors) { - ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); - if (unlikely(ret)) - return ret; - } - - if (quota_sectors) { - ret = bch2_quota_reservation_add(c, inode, &res->quota, - quota_sectors, true); - if (unlikely(ret)) { - struct disk_reservation tmp = { - .sectors = disk_sectors - }; - - bch2_disk_reservation_put(c, &tmp); - res->disk.sectors -= disk_sectors; - return ret; - } - } - - return 0; -} - -static void bch2_clear_folio_bits(struct folio *folio) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_folio *s = bch2_folio(folio); - struct disk_reservation disk_res = { 0 }; - int i, sectors = folio_sectors(folio), dirty_sectors = 0; - - if (!s) - return; - - EBUG_ON(!folio_test_locked(folio)); - EBUG_ON(folio_test_writeback(folio)); - - for (i = 0; i < sectors; i++) { - disk_res.sectors += s->s[i].replicas_reserved; - s->s[i].replicas_reserved = 0; - - dirty_sectors -= s->s[i].state == SECTOR_dirty; - folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); - } - - bch2_disk_reservation_put(c, &disk_res); - - i_sectors_acct(c, inode, NULL, dirty_sectors); - - bch2_folio_release(folio); -} - -static void bch2_set_folio_dirty(struct bch_fs *c, - struct 
bch_inode_info *inode, - struct folio *folio, - struct bch2_folio_reservation *res, - unsigned offset, unsigned len) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, dirty_sectors = 0; - - WARN_ON((u64) folio_pos(folio) + offset + len > - round_up((u64) i_size_read(&inode->v), block_bytes(c))); - - BUG_ON(!s->uptodate); - - spin_lock(&s->lock); - - for (i = round_down(offset, block_bytes(c)) >> 9; - i < round_up(offset + len, block_bytes(c)) >> 9; - i++) { - unsigned sectors = sectors_to_reserve(&s->s[i], - res->disk.nr_replicas); - - /* - * This can happen if we race with the error path in - * bch2_writepage_io_done(): - */ - sectors = min_t(unsigned, sectors, res->disk.sectors); - - s->s[i].replicas_reserved += sectors; - res->disk.sectors -= sectors; - - dirty_sectors += s->s[i].state == SECTOR_unallocated; - - folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); - } - - spin_unlock(&s->lock); - - i_sectors_acct(c, inode, &res->quota, dirty_sectors); - - if (!folio_test_dirty(folio)) - filemap_dirty_folio(inode->v.i_mapping, folio); -} - -vm_fault_t bch2_page_fault(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct address_space *fdm = faults_disabled_mapping(); - struct bch_inode_info *inode = file_bch_inode(file); - vm_fault_t ret; - - if (fdm == mapping) - return VM_FAULT_SIGBUS; - - /* Lock ordering: */ - if (fdm > mapping) { - struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - - if (bch2_pagecache_add_tryget(inode)) - goto got_lock; - - bch2_pagecache_block_put(fdm_host); - - bch2_pagecache_add_get(inode); - bch2_pagecache_add_put(inode); - - bch2_pagecache_block_get(fdm_host); - - /* Signal that lock has been dropped: */ - set_fdm_dropped_locks(); - return VM_FAULT_SIGBUS; - } - - bch2_pagecache_add_get(inode); -got_lock: - ret = filemap_fault(vmf); - bch2_pagecache_add_put(inode); - - return ret; -} - -vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -{ - struct folio *folio = page_folio(vmf->page); - struct file *file = vmf->vma->vm_file; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - unsigned len; - loff_t isize; - vm_fault_t ret; - - bch2_folio_reservation_init(c, inode, &res); - - sb_start_pagefault(inode->v.i_sb); - file_update_time(file); - - /* - * Not strictly necessary, but helps avoid dio writes livelocking in - * write_invalidate_inode_pages_range() - can drop this if/when we get - * a write_invalidate_inode_pages_range() that works without dropping - * page lock before invalidating page - */ - bch2_pagecache_add_get(inode); - - folio_lock(folio); - isize = i_size_read(&inode->v); - - if (folio->mapping != mapping || folio_pos(folio) >= isize) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); - - if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { - folio_unlock(folio); - ret = VM_FAULT_SIGBUS; - goto out; - } - - bch2_set_folio_dirty(c, inode, folio, &res, 0, len); - bch2_folio_reservation_put(c, inode, &res); - - folio_wait_stable(folio); - ret = VM_FAULT_LOCKED; -out: - bch2_pagecache_add_put(inode); - sb_end_pagefault(inode->v.i_sb); - - return ret; -} - -void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -{ - if (offset || length < 
folio_size(folio)) - return; - - bch2_clear_folio_bits(folio); -} - -bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -{ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) - return false; - - bch2_clear_folio_bits(folio); - return true; -} - -/* readpage(s): */ - -static void bch2_readpages_end_io(struct bio *bio) -{ - struct folio_iter fi; - - bio_for_each_folio_all(fi, bio) { - if (!bio->bi_status) { - folio_mark_uptodate(fi.folio); - } else { - folio_clear_uptodate(fi.folio); - folio_set_error(fi.folio); - } - folio_unlock(fi.folio); - } - - bio_put(bio); -} - -struct readpages_iter { - struct address_space *mapping; - unsigned idx; - folios folios; -}; - -static int readpages_iter_init(struct readpages_iter *iter, - struct readahead_control *ractl) -{ - struct folio **fi; - int ret; - - memset(iter, 0, sizeof(*iter)); - - iter->mapping = ractl->mapping; - - ret = filemap_get_contig_folios_d(iter->mapping, - ractl->_index << PAGE_SHIFT, - (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, - 0, mapping_gfp_mask(iter->mapping), - &iter->folios); - if (ret) - return ret; - - darray_for_each(iter->folios, fi) { - ractl->_nr_pages -= 1U << folio_order(*fi); - __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); - folio_put(*fi); - folio_put(*fi); - } - - return 0; -} - -static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -{ - if (iter->idx >= iter->folios.nr) - return NULL; - return iter->folios.data[iter->idx]; -} - -static inline void readpage_iter_advance(struct readpages_iter *iter) -{ - iter->idx++; -} - -static bool extent_partial_reads_expensive(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bch_extent_crc_unpacked crc; - const union bch_extent_entry *i; - - bkey_for_each_crc(k.k, ptrs, crc, i) - if (crc.csum_type || crc.compression_type) - return true; - return false; -} - -static int readpage_bio_extend(struct btree_trans *trans, - struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) -{ - /* Don't hold btree locks while allocating memory: */ - bch2_trans_unlock(trans); - - while (bio_sectors(bio) < sectors_this_extent && - bio->bi_vcnt < bio->bi_max_vecs) { - struct folio *folio = readpage_iter_peek(iter); - int ret; - - if (folio) { - readpage_iter_advance(iter); - } else { - pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - - if (!get_more) - break; - - folio = xa_load(&iter->mapping->i_pages, folio_offset); - if (folio && !xa_is_value(folio)) - break; - - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); - if (!folio) - break; - - if (!__bch2_folio_create(folio, GFP_KERNEL)) { - folio_put(folio); - break; - } - - ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); - if (ret) { - __bch2_folio_release(folio); - folio_put(folio); - break; - } - - folio_put(folio); - } - - BUG_ON(folio_sector(folio) != bio_end_sector(bio)); - - BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); - } - - return bch2_trans_relock(trans); -} - -static void bchfs_read(struct btree_trans *trans, - struct bch_read_bio *rbio, - subvol_inum inum, - struct readpages_iter *readpages_iter) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; - u32 snapshot; - int ret = 0; - - rbio->c = c; - rbio->start_time = local_clock(); - rbio->subvol = inum.subvol; - - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - iter = (struct 
btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), - BTREE_ITER_SLOTS); - while (1) { - struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; - enum btree_id data_btree = BTREE_ID_extents; - - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); - if (ret) - break; - - bch2_btree_iter_set_pos(&iter, - POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - break; - - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&sk, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &sk); - if (ret) - break; - - k = bkey_i_to_s_c(sk.k); - - sectors = min(sectors, k.k->size - offset_into_extent); - - if (readpages_iter) { - ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); - if (ret) - break; - } - - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); - - if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; - - bch2_bio_page_state_set(&rbio->bio, k); - - bch2_read_extent(trans, rbio, iter.pos, - data_btree, k, offset_into_extent, flags); - - if (flags & BCH_READ_LAST_FRAGMENT) - break; - - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) - break; - } -err: - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - } - - bch2_bkey_buf_exit(&sk, c); -} - -void bch2_readahead(struct readahead_control *ractl) -{ - struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct btree_trans trans; - struct folio *folio; - struct readpages_iter readpages_iter; - int ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - ret = readpages_iter_init(&readpages_iter, ractl); - BUG_ON(ret); - - bch2_trans_init(&trans, c, 0, 0); - - bch2_pagecache_add_get(inode); - - while ((folio = readpage_iter_peek(&readpages_iter))) { - unsigned n = min_t(unsigned, - readpages_iter.folios.nr - - readpages_iter.idx, - BIO_MAX_VECS); - struct bch_read_bio *rbio = - rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_KERNEL, &c->bio_read), - opts); - - readpage_iter_advance(&readpages_iter); - - rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bchfs_read(&trans, rbio, inode_inum(inode), - &readpages_iter); - bch2_trans_unlock(&trans); - } - - bch2_pagecache_add_put(inode); - - bch2_trans_exit(&trans); - darray_exit(&readpages_iter.folios); -} - -static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct folio *folio) -{ - struct btree_trans trans; - - bch2_folio_create(folio, __GFP_NOFAIL); - - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; - 
rbio->bio.bi_iter.bi_sector = folio_sector(folio); - BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); - - bch2_trans_init(&trans, c, 0, 0); - bchfs_read(&trans, rbio, inum, NULL); - bch2_trans_exit(&trans); -} - -static void bch2_read_single_folio_end_io(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int bch2_read_single_folio(struct folio *folio, - struct address_space *mapping) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_read_bio *rbio; - struct bch_io_opts opts; - int ret; - DECLARE_COMPLETION_ONSTACK(done); - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); - rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - - __bchfs_readfolio(c, rbio, inode_inum(inode), folio); - wait_for_completion(&done); - - ret = blk_status_to_errno(rbio->bio.bi_status); - bio_put(&rbio->bio); - - if (ret < 0) - return ret; - - folio_mark_uptodate(folio); - return 0; -} - -int bch2_read_folio(struct file *file, struct folio *folio) -{ - int ret; - - ret = bch2_read_single_folio(folio, folio->mapping); - folio_unlock(folio); - return bch2_err_class(ret); -} - -/* writepages: */ - -struct bch_writepage_state { - struct bch_writepage_io *io; - struct bch_io_opts opts; - struct bch_folio_sector *tmp; - unsigned tmp_sectors; -}; - -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - -static void bch2_writepage_io_done(struct bch_write_op *op) -{ - struct bch_writepage_io *io = - container_of(op, struct bch_writepage_io, op); - struct bch_fs *c = io->op.c; - struct bio *bio = &io->op.wbio.bio; - struct folio_iter fi; - unsigned i; - - if (io->op.error) { - set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - folio_set_error(fi.folio); - mapping_set_error(fi.folio->mapping, -EIO); - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s; - - s = __bch2_folio(fi.folio); - spin_lock(&s->lock); - for (i = 0; i < folio_sectors(fi.folio); i++) - s->s[i].nr_replicas = 0; - spin_unlock(&s->lock); - } - } - - /* - * racing with fallocate can cause us to add fewer sectors than - * expected - but we shouldn't add more sectors than expected: - */ - WARN_ON_ONCE(io->op.i_sectors_delta > 0); - - /* - * (error (due to going RO) halfway through a page can screw that up - * slightly) - * XXX wtf? 
- BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); - */ - - /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: - */ - i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - - bio_for_each_folio_all(fi, bio) { - struct bch_folio *s = __bch2_folio(fi.folio); - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(fi.folio); - } - - bio_put(&io->op.wbio.bio); -} - -static void bch2_writepage_do_io(struct bch_writepage_state *w) -{ - struct bch_writepage_io *io = w->io; - - w->io = NULL; - closure_call(&io->op.cl, bch2_write, NULL, NULL); -} - -/* - * Get a bch_writepage_io and add @page to it - appending to an existing one if - * possible, else allocating a new one: - */ -static void bch2_writepage_io_alloc(struct bch_fs *c, - struct writeback_control *wbc, - struct bch_writepage_state *w, - struct bch_inode_info *inode, - u64 sector, - unsigned nr_replicas) -{ - struct bch_write_op *op; - - w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, - REQ_OP_WRITE, - GFP_KERNEL, - &c->writepage_bioset), - struct bch_writepage_io, op.wbio.bio); - - w->io->inode = inode; - op = &w->io->op; - bch2_write_op_init(op, c, w->opts); - op->target = w->opts.foreground_target; - op->nr_replicas = nr_replicas; - op->res.nr_replicas = nr_replicas; - op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; - op->pos = POS(inode->v.i_ino, sector); - op->end_io = bch2_writepage_io_done; - op->devs_need_flush = &inode->ei_devs_need_flush; - op->wbio.bio.bi_iter.bi_sector = sector; - op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -} - -static int __bch2_writepage(struct folio *folio, - struct writeback_control *wbc, - void *data) -{ - struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_writepage_state *w = data; - struct bch_folio *s; - unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; - loff_t i_size = i_size_read(&inode->v); - int ret; - - EBUG_ON(!folio_test_uptodate(folio)); - - /* Is the folio fully inside i_size? */ - if (folio_end_pos(folio) <= i_size) - goto do_io; - - /* Is the folio fully outside i_size? (truncate in progress) */ - if (folio_pos(folio) >= i_size) { - folio_unlock(folio); - return 0; - } - - /* - * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the folio size. For a file that is not a multiple of - * the folio size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." 
- */ - folio_zero_segment(folio, - i_size - folio_pos(folio), - folio_size(folio)); -do_io: - f_sectors = folio_sectors(folio); - s = bch2_folio(folio); - - if (f_sectors > w->tmp_sectors) { - kfree(w->tmp); - w->tmp = kzalloc(sizeof(struct bch_folio_sector) * - f_sectors, __GFP_NOFAIL); - w->tmp_sectors = f_sectors; - } - - /* - * Things get really hairy with errors during writeback: - */ - ret = bch2_get_folio_disk_reservation(c, inode, folio, false); - BUG_ON(ret); - - /* Before unlocking the page, get copy of reservations: */ - spin_lock(&s->lock); - memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - nr_replicas_this_write = - min_t(unsigned, nr_replicas_this_write, - s->s[i].nr_replicas + - s->s[i].replicas_reserved); - } - - for (i = 0; i < f_sectors; i++) { - if (s->s[i].state < SECTOR_dirty) - continue; - - s->s[i].nr_replicas = w->opts.compression - ? 0 : nr_replicas_this_write; - - s->s[i].replicas_reserved = 0; - folio_sector_set(folio, s, i, SECTOR_allocated); - } - spin_unlock(&s->lock); - - BUG_ON(atomic_read(&s->write_count)); - atomic_set(&s->write_count, 1); - - BUG_ON(folio_test_writeback(folio)); - folio_start_writeback(folio); - - folio_unlock(folio); - - offset = 0; - while (1) { - unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; - u64 sector; - - while (offset < f_sectors && - w->tmp[offset].state < SECTOR_dirty) - offset++; - - if (offset == f_sectors) - break; - - while (offset + sectors < f_sectors && - w->tmp[offset + sectors].state >= SECTOR_dirty) { - reserved_sectors += w->tmp[offset + sectors].replicas_reserved; - dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; - sectors++; - } - BUG_ON(!sectors); - - sector = folio_sector(folio) + offset; - - if (w->io && - (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, sectors << 9) || - w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= - (BIO_MAX_VECS * PAGE_SIZE) || - bio_end_sector(&w->io->op.wbio.bio) != sector)) - bch2_writepage_do_io(w); - - if (!w->io) - bch2_writepage_io_alloc(c, wbc, w, inode, sector, - nr_replicas_this_write); - - atomic_inc(&s->write_count); - - BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, - sectors << 9, offset << 9)); - - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - - w->io->op.res.sectors += reserved_sectors; - w->io->op.i_sectors_delta -= dirty_sectors; - w->io->op.new_i_size = i_size; - - offset += sectors; - } - - if (atomic_dec_and_test(&s->write_count)) - folio_end_writeback(folio); - - return 0; -} - -int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; - - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); - return bch2_err_class(ret); -} - -/* buffered writes: */ - -int bch2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct 
page **pagep, void **fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res; - struct folio *folio; - unsigned offset; - int ret = -ENOMEM; - - res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) - return -ENOMEM; - - bch2_folio_reservation_init(c, inode, res); - *fsdata = res; - - bch2_pagecache_add_get(inode); - - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, - FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, - mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) - goto err_unlock; - - if (folio_test_uptodate(folio)) - goto out; - - offset = pos - folio_pos(folio); - len = min_t(size_t, len, folio_end_pos(folio) - pos); - - /* If we're writing entire folio, don't need to read it in first: */ - if (!offset && len == folio_size(folio)) - goto out; - - if (!offset && pos + len >= inode->v.i_size) { - folio_zero_segment(folio, len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } - - if (folio_pos(folio) >= inode->v.i_size) { - folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); - flush_dcache_folio(folio); - goto out; - } -readpage: - ret = bch2_read_single_folio(folio, mapping); - if (ret) - goto err; -out: - ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); - if (ret) - goto err; - - ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); - if (ret) { - if (!folio_test_uptodate(folio)) { - /* - * If the folio hasn't been read in, we won't know if we - * actually need a reservation - we don't actually need - * to read here, we just need to check if the folio is - * fully backed by uncompressed data: - */ - goto readpage; - } - - goto err; - } - - *pagep = &folio->page; - return 0; -err: - folio_unlock(folio); - folio_put(folio); - *pagep = NULL; -err_unlock: - bch2_pagecache_add_put(inode); - kfree(res); - *fsdata = NULL; - return bch2_err_class(ret); -} - -int bch2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct bch_inode_info *inode = to_bch_ei(mapping->host); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); - unsigned offset = pos - folio_pos(folio); - - lockdep_assert_held(&inode->v.i_rwsem); - BUG_ON(offset + copied > folio_size(folio)); - - if (unlikely(copied < len && !folio_test_uptodate(folio))) { - /* - * The folio needs to be read in, but that would destroy - * our partial write - simplest thing is to just force - * userspace to redo the write: - */ - folio_zero_range(folio, 0, folio_size(folio)); - flush_dcache_folio(folio); - copied = 0; - } - - spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); - spin_unlock(&inode->v.i_lock); - - if (copied) { - if (!folio_test_uptodate(folio)) - folio_mark_uptodate(folio); - - bch2_set_folio_dirty(c, inode, folio, res, offset, copied); - - inode->ei_last_dirtied = (unsigned long) current; - } - - folio_unlock(folio); - folio_put(folio); - bch2_pagecache_add_put(inode); - - bch2_folio_reservation_put(c, inode, res); - kfree(res); - - return copied; -} - -static noinline void folios_trunc(folios *folios, struct folio **fi) -{ - while (folios->data + folios->nr > fi) { - struct folio *f = darray_pop(folios); - - folio_unlock(f); - folio_put(f); - } -} - -static int __bch2_buffered_write(struct bch_inode_info *inode, - struct address_space 
*mapping, - struct iov_iter *iter, - loff_t pos, unsigned len) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_folio_reservation res; - folios folios; - struct folio **fi, *f; - unsigned copied = 0, f_offset; - u64 end = pos + len, f_pos; - loff_t last_folio_pos = inode->v.i_size; - int ret = 0; - - BUG_ON(!len); - - bch2_folio_reservation_init(c, inode, &res); - darray_init(&folios); - - ret = filemap_get_contig_folios_d(mapping, pos, end, - FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, - mapping_gfp_mask(mapping), - &folios); - if (ret) - goto out; - - BUG_ON(!folios.nr); - - f = darray_first(folios); - if (pos != folio_pos(f) && !folio_test_uptodate(f)) { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - - f = darray_last(folios); - end = min(end, folio_end_pos(f)); - last_folio_pos = folio_pos(f); - if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { - if (end >= inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - } else { - ret = bch2_read_single_folio(f, mapping); - if (ret) - goto out; - } - } - - ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); - if (ret) - goto out; - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - - /* - * XXX: per POSIX and fstests generic/275, on -ENOSPC we're - * supposed to write as much as we have disk space for. - * - * On failure here we should still write out a partial page if - * we aren't completely out of disk space - we don't do that - * yet: - */ - ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); - if (unlikely(ret)) { - folios_trunc(&folios, fi); - if (!folios.nr) - goto out; - - end = min(end, folio_end_pos(darray_last(folios))); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (mapping_writably_mapped(mapping)) - darray_for_each(folios, fi) - flush_dcache_folio(*fi); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); - - if (!f_copied) { - folios_trunc(&folios, fi); - break; - } - - if (!folio_test_uptodate(f) && - f_copied != folio_size(f) && - pos + copied + f_copied < inode->v.i_size) { - folio_zero_range(f, 0, folio_size(f)); - folios_trunc(&folios, fi); - break; - } - - flush_dcache_folio(f); - copied += f_copied; - - if (f_copied != f_len) { - folios_trunc(&folios, fi + 1); - break; - } - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - if (!copied) - goto out; - - end = pos + copied; - - spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) - i_size_write(&inode->v, end); - spin_unlock(&inode->v.i_lock); - - f_pos = pos; - f_offset = pos - folio_pos(darray_first(folios)); - darray_for_each(folios, fi) { - struct folio *f = *fi; - u64 f_len = min(end, folio_end_pos(f)) - f_pos; - - if (!folio_test_uptodate(f)) - folio_mark_uptodate(f); - - bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - - f_pos = folio_end_pos(f); - f_offset = 0; - } - - inode->ei_last_dirtied = (unsigned long) current; -out: - darray_for_each(folios, fi) { - folio_unlock(*fi); - folio_put(*fi); - } - - /* - * If the last folio added to the mapping starts beyond current EOF, we - * performed a short write but left around at least one post-EOF folio. - * Clean up the mapping before we return. 
- */ - if (last_folio_pos >= inode->v.i_size) - truncate_pagecache(&inode->v, inode->v.i_size); - - darray_exit(&folios); - bch2_folio_reservation_put(c, inode, &res); - - return copied ?: ret; -} - -static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; - - bch2_pagecache_add_get(inode); - - do { - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = iov_iter_count(iter); -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE - offset); - - if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { - ret = -EFAULT; - break; - } - } - - if (unlikely(fatal_signal_pending(current))) { - ret = -EINTR; - break; - } - - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); - if (unlikely(ret < 0)) - break; - - cond_resched(); - - if (unlikely(ret == 0)) { - /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. - */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(iter)); - goto again; - } - pos += ret; - written += ret; - ret = 0; - - balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); - - bch2_pagecache_add_put(inode); - - return written ? 
written : ret; -} - -/* O_DIRECT reads */ - -static void bio_check_or_release(struct bio *bio, bool check_dirty) -{ - if (check_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static void bch2_dio_read_complete(struct closure *cl) -{ - struct dio_read *dio = container_of(cl, struct dio_read, cl); - - dio->req->ki_complete(dio->req, dio->ret); - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -} - -static void bch2_direct_IO_read_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - - if (bio->bi_status) - dio->ret = blk_status_to_errno(bio->bi_status); - - closure_put(&dio->cl); -} - -static void bch2_direct_IO_read_split_endio(struct bio *bio) -{ - struct dio_read *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - bch2_direct_IO_read_endio(bio); - bio_check_or_release(bio, should_dirty); -} - -static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -{ - struct file *file = req->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts; - struct dio_read *dio; - struct bio *bio; - loff_t offset = req->ki_pos; - bool sync = is_sync_kiocb(req); - size_t shorten; - ssize_t ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - if ((offset|iter->count) & (block_bytes(c) - 1)) - return -EINVAL; - - ret = min_t(loff_t, iter->count, - max_t(loff_t, 0, i_size_read(&inode->v) - offset)); - - if (!ret) - return ret; - - shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); - iter->count -= shorten; - - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->dio_read_bioset); - - bio->bi_end_io = bch2_direct_IO_read_endio; - - dio = container_of(bio, struct dio_read, rbio.bio); - closure_init(&dio->cl, NULL); - - /* - * this is a _really_ horrible hack just to avoid an atomic sub at the - * end: - */ - if (!sync) { - set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER - - CLOSURE_RUNNING + - CLOSURE_DESTRUCTOR); - } else { - atomic_set(&dio->cl.remaining, - CLOSURE_REMAINING_INITIALIZER + 1); - } - - dio->req = req; - dio->ret = ret; - /* - * This is one of the sketchier things I've encountered: we have to skip - * the dirtying of requests that are internal from the kernel (i.e. from - * loopback), because we'll deadlock on page_lock. 
- */ - dio->should_dirty = iter_is_iovec(iter); - - goto start; - while (iter->count) { - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_READ, - GFP_KERNEL, - &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; -start: - bio->bi_opf = REQ_OP_READ|REQ_SYNC; - bio->bi_iter.bi_sector = offset >> 9; - bio->bi_private = dio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (ret < 0) { - /* XXX: fault inject this path */ - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - break; - } - - offset += bio->bi_iter.bi_size; - - if (dio->should_dirty) - bio_set_pages_dirty(bio); - - if (iter->count) - closure_get(&dio->cl); - - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); - } - - iter->count += shorten; - - if (sync) { - closure_sync(&dio->cl); - closure_debug_destroy(&dio->cl); - ret = dio->ret; - bio_check_or_release(&dio->rbio.bio, dio->should_dirty); - return ret; - } else { - return -EIOCBQUEUED; - } -} - -ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(iter); - ssize_t ret; - - if (!count) - return 0; /* skip atime */ - - if (iocb->ki_flags & IOCB_DIRECT) { - struct blk_plug plug; - - if (unlikely(mapping->nrpages)) { - ret = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (ret < 0) - goto out; - } - - file_accessed(file); - - blk_start_plug(&plug); - ret = bch2_direct_IO_read(iocb, iter); - blk_finish_plug(&plug); - - if (ret >= 0) - iocb->ki_pos += ret; - } else { - bch2_pagecache_add_get(inode); - ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(inode); - } -out: - return bch2_err_class(ret); -} - -/* O_DIRECT writes */ - -static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, - u64 offset, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - u64 end = offset + size; - u32 snapshot; - bool ret = true; - int err; - - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); - if (err) - goto err; - - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_SLOTS, k, err) { - if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) - break; - - if (k.k->p.snapshot != snapshot || - nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(&trans, &iter); -err: - if (bch2_err_matches(err, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_exit(&trans); - - return err ? 
false : ret; -} - -static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - return bch2_check_range_allocated(c, inode_inum(inode), - dio->op.pos.offset, bio_sectors(bio), - dio->op.opts.data_replicas, - dio->op.opts.compression != 0); -} - -static void bch2_dio_write_loop_async(struct bch_write_op *); -static __always_inline long bch2_dio_write_done(struct dio_write *dio); - -/* - * We're going to return -EIOCBQUEUED, but we haven't finished consuming the - * iov_iter yet, so we need to stash a copy of the iovec: it might be on the - * caller's stack, we're not guaranteed that it will live for the duration of - * the IO: - */ -static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -{ - struct iovec *iov = dio->inline_vecs; - - /* - * iov_iter has a single embedded iovec - nothing to do: - */ - if (iter_is_ubuf(&dio->iter)) - return 0; - - /* - * We don't currently handle non-iovec iov_iters here - return an error, - * and we'll fall back to doing the IO synchronously: - */ - if (!iter_is_iovec(&dio->iter)) - return -1; - - if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { - iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), - GFP_KERNEL); - if (unlikely(!iov)) - return -ENOMEM; - - dio->free_iov = true; - } - - memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.__iov = iov; - return 0; -} - -static void bch2_dio_write_flush_done(struct closure *cl) -{ - struct dio_write *dio = container_of(cl, struct dio_write, op.cl); - struct bch_fs *c = dio->op.c; - - closure_debug_destroy(cl); - - dio->op.error = bch2_journal_error(&c->journal); - - bch2_dio_write_done(dio); -} +struct nocow_flush { + struct closure *cl; + struct bch_dev *ca; + struct bio bio; +}; -static noinline void bch2_dio_write_flush(struct dio_write *dio) +static void nocow_flush_endio(struct bio *_bio) { - struct bch_fs *c = dio->op.c; - struct bch_inode_unpacked inode; - int ret; - dio->flush = 0; - - closure_init(&dio->op.cl, NULL); - - if (!dio->op.error) { - ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) { - dio->op.error = ret; - } else { - bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); - bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); - } - } + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); - if (dio->sync) { - closure_sync(&dio->op.cl); - closure_debug_destroy(&dio->op.cl); - } else { - continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); - } + closure_put(bio->cl); + percpu_ref_put(&bio->ca->io_ref); + bio_put(&bio->bio); } -static __always_inline long bch2_dio_write_done(struct dio_write *dio) +void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) { - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - bool sync = dio->sync; - long ret; - - if (unlikely(dio->flush)) { - bch2_dio_write_flush(dio); - if (!sync) - return -EIOCBQUEUED; - } - - bch2_pagecache_block_put(inode); + struct nocow_flush *bio; + struct bch_dev *ca; + struct bch_devs_mask devs; + unsigned dev; - if (dio->free_iov) - kfree(dio->iter.__iov); + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); + if (dev == BCH_SB_MEMBERS_MAX) + return; - ret = dio->op.error ?: ((long) dio->written << 9); - bio_put(&dio->op.wbio.bio); + devs = inode->ei_devs_need_flush; + 
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - /* inode->i_dio_count is our ref on inode and thus bch_fs */ - inode_dio_end(&inode->v); + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); - if (ret < 0) - ret = bch2_err_class(ret); + if (!ca) + continue; - if (!sync) { - req->ki_complete(req, ret); - ret = -EIOCBQUEUED; + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, + REQ_OP_FLUSH, + GFP_KERNEL, + &c->nocow_flush_bioset), + struct nocow_flush, bio); + bio->cl = cl; + bio->ca = ca; + bio->bio.bi_end_io = nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); } - return ret; } -static __always_inline void bch2_dio_write_end(struct dio_write *dio) +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) { - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct bch_inode_info *inode = dio->inode; - struct bio *bio = &dio->op.wbio.bio; - - req->ki_pos += (u64) dio->op.written << 9; - dio->written += dio->op.written; - - if (dio->extending) { - spin_lock(&inode->v.i_lock); - if (req->ki_pos > inode->v.i_size) - i_size_write(&inode->v, req->ki_pos); - spin_unlock(&inode->v.i_lock); - } - - if (dio->op.i_sectors_delta || dio->quota_res.sectors) { - mutex_lock(&inode->ei_quota_lock); - __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); - __bch2_quota_reservation_put(c, inode, &dio->quota_res); - mutex_unlock(&inode->ei_quota_lock); - } + struct closure cl; - bio_release_pages(bio, false); + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); - if (unlikely(dio->op.error)) - set_bit(EI_INODE_ERROR, &inode->ei_flags); + return 0; } -static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -{ - struct bch_fs *c = dio->op.c; - struct kiocb *req = dio->req; - struct address_space *mapping = dio->mapping; - struct bch_inode_info *inode = dio->inode; - struct bch_io_opts opts; - struct bio *bio = &dio->op.wbio.bio; - unsigned unaligned, iter_count; - bool sync = dio->sync, dropped_locks; - long ret; - - bch2_inode_opts_get(&opts, c, &inode->ei_inode); - - while (1) { - iter_count = dio->iter.count; - - EBUG_ON(current->faults_disabled_mapping); - current->faults_disabled_mapping = mapping; - - ret = bio_iov_iter_get_pages(bio, &dio->iter); - - dropped_locks = fdm_dropped_locks(); - - current->faults_disabled_mapping = NULL; - - /* - * If the fault handler returned an error but also signalled - * that it dropped & retook ei_pagecache_lock, we just need to - * re-shoot down the page cache and retry: - */ - if (dropped_locks && ret) - ret = 0; - - if (unlikely(ret < 0)) - goto err; - - if (unlikely(dropped_locks)) { - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter_count - 1); - if (unlikely(ret)) - goto err; - - if (!bio->bi_iter.bi_size) - continue; - } - - unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); - bio->bi_iter.bi_size -= unaligned; - iov_iter_revert(&dio->iter, unaligned); - - if (!bio->bi_iter.bi_size) { - /* - * bio_iov_iter_get_pages was only able to get < - * blocksize worth of pages: - */ - ret = -EFAULT; - goto err; - } - - bch2_write_op_init(&dio->op, c, opts); - dio->op.end_io = sync - ? 
NULL - : bch2_dio_write_loop_async; - dio->op.target = dio->op.opts.foreground_target; - dio->op.write_point = writepoint_hashed((unsigned long) current); - dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; - dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); - dio->op.devs_need_flush = &inode->ei_devs_need_flush; - - if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; - - ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, - bio_sectors(bio), true); - if (unlikely(ret)) - goto err; - - ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), - dio->op.opts.data_replicas, 0); - if (unlikely(ret) && - !bch2_dio_write_check_allocated(dio)) - goto err; - - task_io_account_write(bio->bi_iter.bi_size); - - if (unlikely(dio->iter.count) && - !dio->sync && - !dio->loop && - bch2_dio_write_copy_iov(dio)) - dio->sync = sync = true; - - dio->loop = true; - closure_call(&dio->op.cl, bch2_write, NULL, NULL); - - if (!sync) - return -EIOCBQUEUED; - - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) - break; - - bio_reset(bio, NULL, REQ_OP_WRITE); - } -out: - return bch2_dio_write_done(dio); -err: - dio->op.error = ret; - - bio_release_pages(bio, false); +/* i_size updates: */ - bch2_quota_reservation_put(c, inode, &dio->quota_res); - goto out; -} +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; -static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +static int inode_set_size(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) { - struct mm_struct *mm = dio->mm; + struct inode_new_size *s = p; - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - if (mm) - kthread_use_mm(mm); - bch2_dio_write_loop(dio); - if (mm) - kthread_unuse_mm(mm); + return 0; } -static void bch2_dio_write_loop_async(struct bch_write_op *op) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { - struct dio_write *dio = container_of(op, struct dio_write, op); - - bch2_dio_write_end(dio); + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; - if (likely(!dio->iter.count) || dio->op.error) - bch2_dio_write_done(dio); - else - bch2_dio_write_continue(dio); + return bch2_write_inode(c, inode, inode_set_size, &s, fields); } -static noinline -ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) { - struct file *file = req->ki_filp; - struct address_space *mapping = file->f_mapping; - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct dio_write *dio; - struct bio *bio; - bool locked = true, extending; - ssize_t ret; - - prefetch(&c->opts); - prefetch((void *) &c->opts + 64); - prefetch(&inode->ei_inode); - prefetch((void *) &inode->ei_inode + 64); - - inode_lock(&inode->v); - - ret = generic_write_checks(req, iter); - if (unlikely(ret <= 0)) - goto err; - - ret = file_remove_privs(file); - if (unlikely(ret)) - goto err; - - ret = file_update_time(file); - if (unlikely(ret)) - goto err; - - if (unlikely((req->ki_pos|iter->count) & 
(block_bytes(c) - 1))) - goto err; - - inode_dio_begin(&inode->v); - bch2_pagecache_block_get(inode); + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + inode->v.i_blocks += sectors; - extending = req->ki_pos + iter->count > inode->v.i_size; - if (!extending) { - inode_unlock(&inode->v); - locked = false; - } +#ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { + BUG_ON(sectors > quota_res->sectors); + BUG_ON(sectors > inode->ei_quota_reserved); - bio = bio_alloc_bioset(NULL, - bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), - REQ_OP_WRITE, - GFP_KERNEL, - &c->dio_write_bioset); - dio = container_of(bio, struct dio_write, op.wbio.bio); - dio->req = req; - dio->mapping = mapping; - dio->inode = inode; - dio->mm = current->mm; - dio->loop = false; - dio->extending = extending; - dio->sync = is_sync_kiocb(req) || extending; - dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; - dio->free_iov = false; - dio->quota_res.sectors = 0; - dio->written = 0; - dio->iter = *iter; - dio->op.c = c; - - if (unlikely(mapping->nrpages)) { - ret = write_invalidate_inode_pages_range(mapping, - req->ki_pos, - req->ki_pos + iter->count - 1); - if (unlikely(ret)) - goto err_put_bio; + quota_res->sectors -= sectors; + inode->ei_quota_reserved -= sectors; + } else { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } - - ret = bch2_dio_write_loop(dio); -err: - if (locked) - inode_unlock(&inode->v); - return ret; -err_put_bio: - bch2_pagecache_block_put(inode); - bio_put(bio); - inode_dio_end(&inode->v); - goto err; +#endif } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(&inode->v); - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); - current->backing_dev_info = NULL; - - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: - return bch2_err_class(ret); -} /* fsync: */ @@ -2911,10 +305,10 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, s->s[i].nr_replicas = 0; i_sectors_delta -= s->s[i].state == SECTOR_dirty; - folio_sector_set(folio, s, i, SECTOR_unallocated); + bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); } - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); /* * Caller needs to know whether this folio will be written out by @@ -3105,7 +499,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, U64_MAX, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && !bch2_journal_error(&c->journal), c, @@ -3159,7 +553,7 @@ static long 
bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len ret = bch2_fpunch(c, inode_inum(inode), block_start >> 9, block_end >> 9, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); @@ -3210,7 +604,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, new_size = inode->v.i_size + shift; - ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -3226,7 +620,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, &i_sectors_delta); - i_sectors_acct(c, inode, NULL, i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); if (ret) return ret; @@ -3410,6 +804,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (!(mode & FALLOC_FL_ZERO_RANGE)) { + /* + * Lock ordering - can't be holding btree locks while + * blocking on a folio lock: + */ if (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, @@ -3443,10 +841,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; - i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); drop_locks_do(&trans, - (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); + (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -3459,7 +857,7 @@ bkey_err: bch2_fpunch_at(&trans, &iter, inode_inum(inode), end_sector, &i_sectors_delta); - i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); bch2_quota_reservation_put(c, inode, "a_res); } @@ -3653,7 +1051,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, aligned_len = round_up((u64) len, block_bytes(c)); - ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, pos_dst, pos_dst + len - 1); if (ret) goto err; @@ -3665,7 +1063,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, file_update_time(file_dst); - mark_pagecache_unallocated(src, pos_src >> 9, + bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); ret = bch2_remap_range(c, @@ -3681,7 +1079,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, */ ret = min((u64) ret << 9, (u64) len); - i_sectors_acct(c, dst, "a_res, i_sectors_delta); + bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); spin_lock(&dst->v.i_lock); if (pos_dst + ret > dst->v.i_size) @@ -3700,68 +1098,6 @@ err: /* fseek: */ -static int folio_data_offset(struct folio *folio, loff_t pos, - unsigned min_replicas) -{ - struct bch_folio *s = bch2_folio(folio); - unsigned i, sectors = folio_sectors(folio); - - if (s) - for (i = folio_pos_to_s(folio, pos); i < sectors; i++) - if (s->s[i].state >= SECTOR_dirty && - s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) - return i << SECTOR_SHIFT; - - return -1; -} - -static loff_t bch2_seek_pagecache_data(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct folio_batch fbatch; - pgoff_t start_index = start_offset >> PAGE_SHIFT; - pgoff_t end_index 
= end_offset >> PAGE_SHIFT; - pgoff_t index = start_index; - unsigned i; - loff_t ret; - int offset; - - folio_batch_init(&fbatch); - - while (filemap_get_folios(vinode->i_mapping, - &index, end_index, &fbatch)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; - - if (!nonblock) { - folio_lock(folio); - } else if (!folio_trylock(folio)) { - folio_batch_release(&fbatch); - return -EAGAIN; - } - - offset = folio_data_offset(folio, - max(folio_pos(folio), start_offset), - min_replicas); - if (offset >= 0) { - ret = clamp(folio_pos(folio) + offset, - start_offset, end_offset); - folio_unlock(folio); - folio_batch_release(&fbatch); - return ret; - } - folio_unlock(folio); - } - folio_batch_release(&fbatch); - cond_resched(); - } - - return end_offset; -} - static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); @@ -3815,88 +1151,6 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static int folio_hole_offset(struct address_space *mapping, loff_t *offset, - unsigned min_replicas, bool nonblock) -{ - struct folio *folio; - struct bch_folio *s; - unsigned i, sectors; - bool ret = true; - - folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, - !nonblock ? FGP_LOCK : 0, 0); - if (IS_ERR_OR_NULL(folio)) - return true; - - if (nonblock && !folio_trylock(folio)) { - folio_put(folio); - return -EAGAIN; - } - - s = bch2_folio(folio); - if (!s) - goto unlock; - - sectors = folio_sectors(folio); - for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) - if (s->s[i].state < SECTOR_dirty || - s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { - *offset = max(*offset, - folio_pos(folio) + (i << SECTOR_SHIFT)); - goto unlock; - } - - *offset = folio_end_pos(folio); - ret = false; -unlock: - folio_unlock(folio); - folio_put(folio); - return ret; -} - -static loff_t bch2_seek_pagecache_hole(struct inode *vinode, - loff_t start_offset, - loff_t end_offset, - unsigned min_replicas, - bool nonblock) -{ - struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset; - - while (offset < end_offset && - !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) - ; - - return min(offset, end_offset); -} - -static int bch2_clamp_data_hole(struct inode *inode, - u64 *hole_start, - u64 *hole_end, - unsigned min_replicas, - bool nonblock) -{ - loff_t ret; - - ret = bch2_seek_pagecache_hole(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_start = ret; - - if (*hole_start == *hole_end) - return 0; - - ret = bch2_seek_pagecache_data(inode, - *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; - if (ret < 0) - return ret; - - *hole_end = ret; - return 0; -} - static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); @@ -3981,28 +1235,10 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) void bch2_fs_fsio_exit(struct bch_fs *c) { bioset_exit(&c->nocow_flush_bioset); - bioset_exit(&c->dio_write_bioset); - bioset_exit(&c->dio_read_bioset); - bioset_exit(&c->writepage_bioset); } int bch2_fs_fsio_init(struct bch_fs *c) { - if (bioset_init(&c->writepage_bioset, - 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_writepage_bioset_init; - - if (bioset_init(&c->dio_read_bioset, - 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS)) - return 
-BCH_ERR_ENOMEM_dio_read_bioset_init; - - if (bioset_init(&c->dio_write_bioset, - 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_dio_write_bioset_init; - if (bioset_init(&c->nocow_flush_bioset, 1, offsetof(struct nocow_flush, bio), 0)) return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index af905331..c6704618 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -5,28 +5,163 @@ #ifndef NO_BCACHEFS_FS #include "buckets.h" +#include "fs.h" #include "io_types.h" +#include "quota.h" #include -struct quota_res; +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + +struct quota_res { + u64 sectors; +}; + +#ifdef CONFIG_BCACHEFS_QUOTA + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + BUG_ON(res->sectors > inode->ei_quota_reserved); + + bch2_quota_acct(c, inode->ei_qid, Q_SPC, + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); + inode->ei_quota_reserved -= res->sectors; + res->sectors = 0; +} + +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (res->sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + u64 sectors, + bool check_enospc) +{ + int ret; + + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + + mutex_lock(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); + if (likely(!ret)) { + inode->ei_quota_reserved += sectors; + res->sectors += sectors; + } + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} -int __must_check bch2_write_inode_size(struct bch_fs *, - struct bch_inode_info *, - loff_t, unsigned); +#else + +static void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} -int bch2_read_folio(struct file *, struct folio *); +static void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} -int bch2_writepages(struct address_space *, struct writeback_control *); -void bch2_readahead(struct readahead_control *); +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + return 0; +} -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); -int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); +#endif -ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); +void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, + struct quota_res *, s64); + +static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + if (sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, quota_res, sectors); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + +void bch2_inode_flush_nocow_writes_async(struct bch_fs *, + struct bch_inode_info *, struct closure *); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); int bch2_fsync(struct file *, loff_t, loff_t, int); @@ -39,11 +174,6 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, loff_t bch2_llseek(struct file *, loff_t, int); -vm_fault_t bch2_page_fault(struct vm_fault *); -vm_fault_t bch2_page_mkwrite(struct vm_fault *); -void bch2_invalidate_folio(struct folio *, size_t, size_t); -bool bch2_release_folio(struct folio *, gfp_t); - void bch2_fs_fsio_exit(struct bch_fs *); int bch2_fs_fsio_init(struct bch_fs *); #else diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 8d2f388b..0b550a97 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -14,6 +14,8 @@ #include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" +#include "fs-io-buffered.h" +#include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "io.h" @@ -203,7 +205,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (ret) { iget_failed(&inode->v); - return ERR_PTR(ret); + return ERR_PTR(bch2_err_class(ret)); } mutex_lock(&c->vfs_inodes_lock); @@ -1000,11 +1002,16 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; if (!dir_emit_dots(file, ctx)) 
return 0; - return bch2_readdir(c, inode_inum(inode), ctx); + ret = bch2_readdir(c, inode_inum(inode), ctx); + if (ret) + bch_err_fn(c, ret); + + return bch2_err_class(ret); } static const struct file_operations bch_file_operations = { diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 0852dbe9..d99c04af 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -11,6 +11,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "recovery.h" #include "subvolume.h" #include "super.h" #include "xattr.h" diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index fea21e1e..e0d41655 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -348,6 +348,8 @@ int bch2_inode_peek(struct btree_trans *trans, return 0; err: bch2_trans_iter_exit(trans, iter); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); return ret; } @@ -520,23 +522,25 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c __bch2_inode_unpacked_to_text(out, &inode); } -static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +static inline u64 bkey_inode_flags(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_inode: - return bkey_s_c_to_inode(k).v->bi_flags & - cpu_to_le32(BCH_INODE_UNLINKED); + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); case KEY_TYPE_inode_v2: - return bkey_s_c_to_inode_v2(k).v->bi_flags & - cpu_to_le32(BCH_INODE_UNLINKED); + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); case KEY_TYPE_inode_v3: - return bkey_s_c_to_inode_v3(k).v->bi_flags & - cpu_to_le64(BCH_INODE_UNLINKED); + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); default: - return false; + return 0; } } +static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +{ + return bkey_inode_flags(k) & BCH_INODE_UNLINKED; +} + int bch2_trans_mark_inode(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index f861ae2f..d141c749 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -14,6 +14,7 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "replicas.h" +#include "sb-clean.h" #include "trace.h" static struct nonce journal_nonce(const struct jset *jset) @@ -208,33 +209,41 @@ static void journal_entry_null_range(void *start, void *end) #define JOURNAL_ENTRY_BAD 7 static void journal_entry_err_msg(struct printbuf *out, + u32 version, struct jset *jset, struct jset_entry *entry) { - prt_str(out, "invalid journal entry "); - if (entry) - prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); - - if (!jset) - prt_printf(out, "in superblock"); - else if (!entry) - prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); - else - prt_printf(out, "at offset %zi/%u seq %llu", - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), - le64_to_cpu(jset->seq)); + prt_str(out, "invalid journal entry, version="); + bch2_version_to_text(out, version); + + if (entry) { + prt_str(out, " type="); + prt_str(out, bch2_jset_entry_types[entry->type]); + } + + if (!jset) { + prt_printf(out, " in superblock"); + } else { + + prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); + + if (entry) + prt_printf(out, " offset=%zi/%u", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s)); + } + prt_str(out, ": "); } -#define journal_entry_err(c, jset, entry, msg, ...) \ +#define journal_entry_err(c, version, jset, entry, msg, ...) 
\ ({ \ struct printbuf buf = PRINTBUF; \ \ - journal_entry_err_msg(&buf, jset, entry); \ + journal_entry_err_msg(&buf, version, jset, entry); \ prt_printf(&buf, msg, ##__VA_ARGS__); \ \ - switch (write) { \ + switch (flags & BKEY_INVALID_WRITE) { \ case READ: \ mustfix_fsck_err(c, "%s", buf.buf); \ break; \ @@ -251,8 +260,8 @@ static void journal_entry_err_msg(struct printbuf *out, true; \ }) -#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ - ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) +#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ + ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) #define FSCK_DELETED_KEY 5 @@ -261,13 +270,15 @@ static int journal_validate_key(struct bch_fs *c, struct jset_entry *entry, unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, int write) + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { + int write = flags & BKEY_INVALID_WRITE; void *next = vstruct_next(entry); struct printbuf buf = PRINTBUF; int ret = 0; - if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { + if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; @@ -275,7 +286,7 @@ static int journal_validate_key(struct bch_fs *c, if (journal_entry_err_on((void *) bkey_next(k) > (void *) vstruct_next(entry), - c, jset, entry, + c, version, jset, entry, "extends past end of journal entry")) { entry->u64s = cpu_to_le16((u64 *) k - entry->_data); journal_entry_null_range(vstruct_next(entry), next); @@ -283,7 +294,7 @@ static int journal_validate_key(struct bch_fs *c, } if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, - c, jset, entry, + c, version, jset, entry, "bad format %u", k->k.format)) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -298,11 +309,7 @@ static int journal_validate_key(struct bch_fs *c, if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), __btree_node_type(level, btree_id), write, &buf)) { printbuf_reset(&buf); - prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", - bch2_jset_entry_types[entry->type], - (u64 *) entry - jset->_data, - le32_to_cpu(jset->u64s), - le64_to_cpu(jset->seq)); + journal_entry_err_msg(&buf, version, jset, entry); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -312,6 +319,7 @@ static int journal_validate_key(struct bch_fs *c, __btree_node_type(level, btree_id), write, &buf); mustfix_fsck_err(c, "%s", buf.buf); + BUG(); le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -330,9 +338,10 @@ fsck_err: } static int journal_entry_btree_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct bkey_i *k = entry->start; @@ -341,7 +350,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, entry->level, entry->btree_id, k, version, big_endian, - write|BKEY_INVALID_JOURNAL); + flags|BKEY_INVALID_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; @@ -369,16 +378,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_btree_root_validate(struct bch_fs 
*c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct bkey_i *k = entry->start; int ret = 0; if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, - c, jset, entry, + c, version, jset, entry, "invalid btree root journal entry: wrong number of keys")) { void *next = vstruct_next(entry); /* @@ -392,7 +402,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, } return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, write); + version, big_endian, flags); fsck_err: return ret; } @@ -404,9 +414,10 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_prio_ptrs_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { /* obsolete, don't care: */ return 0; @@ -418,14 +429,15 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { int ret = 0; if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, - c, jset, entry, + c, version, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -443,15 +455,16 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_blacklist_v2_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, - c, jset, entry, + c, version, jset, entry, "invalid journal seq blacklist entry: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); goto out; @@ -461,7 +474,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > le64_to_cpu(bl_entry->end), - c, jset, entry, + c, version, jset, entry, "invalid journal seq blacklist entry: start > end")) { journal_entry_null_range(entry, vstruct_next(entry)); } @@ -482,9 +495,10 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_ } static int journal_entry_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -492,7 +506,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < sizeof(*u), - c, jset, entry, + c, version, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -514,9 +528,10 @@ static void journal_entry_usage_to_text(struct 
printbuf *out, struct bch_fs *c, } static int journal_entry_data_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -525,7 +540,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, if (journal_entry_err_on(bytes < sizeof(*u) || bytes < sizeof(*u) + u->r.nr_devs, - c, jset, entry, + c, version, jset, entry, "invalid journal entry usage: bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -546,9 +561,10 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_clock_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -556,13 +572,13 @@ static int journal_entry_clock_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes != sizeof(*clock), - c, jset, entry, "bad size")) { + c, version, jset, entry, "bad size")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(clock->rw > 1, - c, jset, entry, "bad rw")) { + c, version, jset, entry, "bad rw")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -581,9 +597,10 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_dev_usage_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -593,7 +610,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, int ret = 0; if (journal_entry_err_on(bytes < expected, - c, jset, entry, "bad size (%u < %u)", + c, version, jset, entry, "bad size (%u < %u)", bytes, expected)) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; @@ -602,13 +619,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, dev = le32_to_cpu(u->dev); if (journal_entry_err_on(!bch2_dev_exists2(c, dev), - c, jset, entry, "bad dev")) { + c, version, jset, entry, "bad dev")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } if (journal_entry_err_on(u->pad, - c, jset, entry, "bad pad")) { + c, version, jset, entry, "bad pad")) { journal_entry_null_range(entry, vstruct_next(entry)); return ret; } @@ -641,9 +658,10 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs } static int journal_entry_log_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return 0; } @@ -658,9 +676,10 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, } static int journal_entry_overwrite_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, 
int write) + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, READ); @@ -674,7 +693,8 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, int); + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -691,11 +711,12 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = { int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - unsigned version, int big_endian, int write) + unsigned version, int big_endian, + enum bkey_invalid_flags flags) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, write) + version, big_endian, flags) : 0; } @@ -711,22 +732,22 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, } static int jset_validate_entries(struct bch_fs *c, struct jset *jset, - int write) + enum bkey_invalid_flags flags) { struct jset_entry *entry; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, jset, entry, + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), + c, version, jset, entry, "journal entry extends past end of jset")) { jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); break; } ret = bch2_journal_entry_validate(c, jset, entry, - le32_to_cpu(jset->version), - JSET_BIG_ENDIAN(jset), write); + version, JSET_BIG_ENDIAN(jset), flags); if (ret) break; } @@ -737,7 +758,7 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - int write) + enum bkey_invalid_flags flags) { unsigned version; int ret = 0; @@ -746,7 +767,8 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, "%s sector %llu seq %llu: incompatible journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -757,7 +779,7 @@ static int jset_validate(struct bch_fs *c, } if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), - c, jset, NULL, + c, version, jset, NULL, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? 
ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -767,7 +789,7 @@ static int jset_validate(struct bch_fs *c, /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), - c, jset, NULL, + c, version, jset, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(jset->last_seq), le64_to_cpu(jset->seq))) { @@ -775,7 +797,7 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_BAD; } - ret = jset_validate_entries(c, jset, write); + ret = jset_validate_entries(c, jset, flags); fsck_err: return ret; } @@ -788,14 +810,15 @@ static int jset_validate_early(struct bch_fs *c, { size_t bytes = vstruct_bytes(jset); unsigned version; - int write = READ; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, "%s sector %llu seq %llu: unknown journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), @@ -810,7 +833,7 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, jset, NULL, + c, version, jset, NULL, "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), bytes)) @@ -1127,7 +1150,7 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - int write = READ; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; i = *_i; @@ -1149,7 +1172,7 @@ int bch2_journal_read(struct bch_fs *c, } if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, &i->j, NULL, + c, le32_to_cpu(i->j.version), &i->j, NULL, "invalid journal entry: last_seq > seq (%llu > %llu)", le64_to_cpu(i->j.last_seq), le64_to_cpu(i->j.seq))) diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 8801e981..a88d097b 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -50,7 +50,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, jset_entry_for_each_key(entry, k) int bch2_journal_entry_validate(struct bch_fs *, struct jset *, - struct jset_entry *, unsigned, int, int); + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 8de83e10..7a3139b5 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -3,13 +3,14 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "buckets.h" #include "errcode.h" #include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" #include "replicas.h" -#include "super.h" +#include "sb-members.h" #include "trace.h" #include diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 5242f20b..256431a6 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -220,8 +220,10 @@ static int bch2_copygc(struct btree_trans *trans, f = move_bucket_in_flight_add(buckets_in_flight, *i); ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) /* rare race: copygc_get_buckets 
returned same bucket more than once */ + if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ + ret = 0; continue; + } if (ret == -ENOMEM) { /* flush IO, continue later */ ret = 0; break; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 55a233c2..cb2186a7 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -5,6 +5,7 @@ #include "bkey_buf.h" #include "alloc_background.h" #include "btree_gc.h" +#include "btree_journal_iter.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -23,6 +24,7 @@ #include "quota.h" #include "recovery.h" #include "replicas.h" +#include "sb-clean.h" #include "subvolume.h" #include "super-io.h" @@ -57,524 +59,6 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } -/* iterate over keys read from the journal: */ - -static int __journal_key_cmp(enum btree_id l_btree_id, - unsigned l_level, - struct bpos l_pos, - const struct journal_key *r) -{ - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); -} - -static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -{ - return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -} - -static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -{ - size_t gap_size = keys->size - keys->nr; - - if (idx >= keys->gap) - idx += gap_size; - return idx; -} - -static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -{ - return keys->d + idx_to_pos(keys, idx); -} - -static size_t __bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - size_t l = 0, r = keys->nr, m; - - while (l < r) { - m = l + ((r - l) >> 1); - if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) - l = m + 1; - else - r = m; - } - - BUG_ON(l < keys->nr && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); - - BUG_ON(l && - __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); - - return l; -} - -static size_t bch2_journal_key_search(struct journal_keys *keys, - enum btree_id id, unsigned level, - struct bpos pos) -{ - return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -} - -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos, - struct bpos end_pos, size_t *idx) -{ - struct journal_keys *keys = &c->journal_keys; - unsigned iters = 0; - struct journal_key *k; -search: - if (!*idx) - *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - - while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { - if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; - - if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && - !k->overwritten) - return k->k; - - (*idx)++; - iters++; - if (iters == 10) { - *idx = 0; - goto search; - } - } - - return NULL; -} - -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, - unsigned level, struct bpos pos) -{ - size_t idx = 0; - - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -} - -static void journal_iters_fix(struct bch_fs *c) -{ - struct journal_keys *keys = &c->journal_keys; - /* The key we just inserted is immediately before the gap: */ - size_t gap_end = keys->gap + (keys->size - keys->nr); - struct btree_and_journal_iter *iter; - - /* - * If an iterator points one after the key we just inserted, decrement - * the iterator so it points at the key we just inserted - if the - * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will - * handle that: - */ - list_for_each_entry(iter, &c->journal_iters, journal.list) - if (iter->journal.idx == gap_end) - iter->journal.idx = keys->gap - 1; -} - -static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -{ - struct journal_keys *keys = &c->journal_keys; - struct journal_iter *iter; - size_t gap_size = keys->size - keys->nr; - - list_for_each_entry(iter, &c->journal_iters, list) { - if (iter->idx > old_gap) - iter->idx -= gap_size; - if (iter->idx >= new_gap) - iter->idx += gap_size; - } -} - -int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct journal_key n = { - .btree_id = id, - .level = level, - .k = k, - .allocated = true, - /* - * Ensure these keys are done last by journal replay, to unblock - * journal reclaim: - */ - .journal_seq = U32_MAX, - }; - struct journal_keys *keys = &c->journal_keys; - size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); - - if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; - return 0; - } - - if (idx > keys->gap) - idx -= keys->size - keys->nr; - - if (keys->nr == keys->size) { - struct journal_keys new_keys = { - .nr = keys->nr, - .size = max_t(size_t, keys->size, 8) * 2, - }; - - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { - bch_err(c, "%s: error allocating new key array (size %zu)", - __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; - } - - /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - *keys = new_keys; - - /* And now the gap is at the end: */ - keys->gap = keys->nr; - } - - journal_iters_move_gap(c, keys->gap, idx); - - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; - - keys->nr++; - keys->d[keys->gap++] = n; - - journal_iters_fix(c); - - return 0; -} - -/* - * Can only be used from the recovery thread while we're still RO - can't be - * used once we've got RW, as journal_keys is at that point used by multiple - * threads: - */ -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) -{ - struct bkey_i *n; - int ret; - - n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); - if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; - - bkey_copy(n, k); - ret = bch2_journal_key_insert_take(c, id, level, 
n);
-	if (ret)
-		kfree(n);
-	return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
-			    unsigned level, struct bpos pos)
-{
-	struct bkey_i whiteout;
-
-	bkey_init(&whiteout.k);
-	whiteout.k.p = pos;
-
-	return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
-				  unsigned level, struct bpos pos)
-{
-	struct journal_keys *keys = &c->journal_keys;
-	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-	if (idx < keys->size &&
-	    keys->d[idx].btree_id == btree &&
-	    keys->d[idx].level == level &&
-	    bpos_eq(keys->d[idx].k->k.p, pos))
-		keys->d[idx].overwritten = true;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-	if (iter->idx < iter->keys->size) {
-		iter->idx++;
-		if (iter->idx == iter->keys->gap)
-			iter->idx += iter->keys->size - iter->keys->nr;
-	}
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
-	struct journal_key *k = iter->keys->d + iter->idx;
-
-	while (k < iter->keys->d + iter->keys->size &&
-	       k->btree_id == iter->btree_id &&
-	       k->level == iter->level) {
-		if (!k->overwritten)
-			return bkey_i_to_s_c(k->k);
-
-		bch2_journal_iter_advance(iter);
-		k = iter->keys->d + iter->idx;
-	}
-
-	return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
-	list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
-				   struct journal_iter *iter,
-				   enum btree_id id, unsigned level,
-				   struct bpos pos)
-{
-	iter->btree_id = id;
-	iter->level = level;
-	iter->keys = &c->journal_keys;
-	iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
-	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-						iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
-	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
-	if (bpos_eq(iter->pos, SPOS_MAX))
-		iter->at_end = true;
-	else
-		iter->pos = bpos_successor(iter->pos);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
-	struct bkey_s_c btree_k, journal_k, ret;
-again:
-	if (iter->at_end)
-		return bkey_s_c_null;
-
-	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
-	       bpos_lt(btree_k.k->p, iter->pos))
-		bch2_journal_iter_advance_btree(iter);
-
-	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-	       bpos_lt(journal_k.k->p, iter->pos))
-		bch2_journal_iter_advance(&iter->journal);
-
-	ret = journal_k.k &&
-		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
-		?
journal_k - : btree_k; - - if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) - ret = bkey_s_c_null; - - if (ret.k) { - iter->pos = ret.k->p; - if (bkey_deleted(ret.k)) { - bch2_btree_and_journal_iter_advance(iter); - goto again; - } - } else { - iter->pos = SPOS_MAX; - iter->at_end = true; - } - - return ret; -} - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -{ - bch2_journal_iter_exit(&iter->journal); -} - -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b, - struct btree_node_iter node_iter, - struct bpos pos) -{ - memset(iter, 0, sizeof(*iter)); - - iter->b = b; - iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); - INIT_LIST_HEAD(&iter->journal.list); - iter->pos = b->data->min_key; - iter->at_end = false; -} - -/* - * this version is used by btree_gc before filesystem has gone RW and - * multithreaded, so uses the journal_iters list: - */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) -{ - struct btree_node_iter node_iter; - - bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); -} - -/* sort and dedup all keys in the journal: */ - -void bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct journal_replay, j) + - vstruct_bytes(&(*i)->j)); - genradix_free(&c->journal_entries); -} - -/* - * When keys compare equal, oldest compares first: - */ -static int journal_sort_key_cmp(const void *_l, const void *_r) -{ - const struct journal_key *l = _l; - const struct journal_key *r = _r; - - return journal_key_cmp(l, r) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); -} - -void bch2_journal_keys_free(struct journal_keys *keys) -{ - struct journal_key *i; - - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; - - for (i = keys->d; i < keys->d + keys->nr; i++) - if (i->allocated) - kfree(i->k); - - kvfree(keys->d); - keys->d = NULL; - keys->nr = keys->gap = keys->size = 0; -} - -static void __journal_keys_sort(struct journal_keys *keys) -{ - struct journal_key *src, *dst; - - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - bpos_eq(src[0].k->k.p, src[1].k->k.p)) - src++; - - *dst++ = *src++; - } - - keys->nr = dst - keys->d; -} - -static int journal_keys_sort(struct bch_fs *c) -{ - struct genradix_iter iter; - struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; - struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - 
nr_keys); - - do { - keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); - - if (!keys->d) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - cond_resched(); - - for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->d[keys->nr++] = (struct journal_key) { - .btree_id = entry->btree_id, - .level = entry->level, - .k = k, - .journal_seq = le64_to_cpu(i->j.seq), - .journal_offset = k->_data - i->j._data, - }; - - nr_read++; - } - } - - __journal_keys_sort(keys); - keys->gap = keys->nr; - - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); - return 0; -} - /* journal replay: */ static void replay_now_at(struct journal *j, u64 seq) @@ -846,134 +330,6 @@ static int journal_replay_early(struct bch_fs *c, /* sb clean section: */ -static struct bkey_i *btree_root_find(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry, *start, *end; - - if (clean) { - start = clean->start; - end = vstruct_end(&clean->field); - } else { - start = j->start; - end = vstruct_last(j); - } - - for (entry = start; entry < end; entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_btree_root && - entry->btree_id == id) - goto found; - - return NULL; -found: - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - return k; -} - -static int verify_superblock_clean(struct bch_fs *c, - struct bch_sb_field_clean **cleanp, - struct jset *j) -{ - unsigned i; - struct bch_sb_field_clean *clean = *cleanp; - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, - "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", - le64_to_cpu(clean->journal_seq), - le64_to_cpu(j->seq))) { - kfree(clean); - *cleanp = NULL; - return 0; - } - - for (i = 0; i < BTREE_ID_NR; i++) { - struct bkey_i *k1, *k2; - unsigned l1 = 0, l2 = 0; - - k1 = btree_root_find(c, clean, NULL, i, &l1); - k2 = btree_root_find(c, NULL, j, i, &l2); - - if (!k1 && !k2) - continue; - - printbuf_reset(&buf1); - printbuf_reset(&buf2); - - if (k1) - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); - else - prt_printf(&buf1, "(none)"); - - if (k2) - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); - else - prt_printf(&buf2, "(none)"); - - mustfix_fsck_err_on(!k1 || !k2 || - IS_ERR(k1) || - IS_ERR(k2) || - k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(&k1->k)) || - l1 != l2, c, - "superblock btree root %u doesn't match journal after clean shutdown\n" - "sb: l=%u %s\n" - "journal: l=%u %s\n", i, - l1, buf1.buf, - l2, buf2.buf); - } -fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); - return ret; -} - -static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *clean, *sb_clean; - int ret; - - mutex_lock(&c->sb_lock); 
- sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - - if (fsck_err_on(!sb_clean, c, - "superblock marked clean but clean section not present")) { - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->sb.clean = false; - mutex_unlock(&c->sb_lock); - return NULL; - } - - clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), - GFP_KERNEL); - if (!clean) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); - } - - ret = bch2_sb_clean_validate_late(c, clean, READ); - if (ret) { - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); - } - - mutex_unlock(&c->sb_lock); - - return clean; -fsck_err: - mutex_unlock(&c->sb_lock); - return ERR_PTR(ret); -} - static bool btree_id_is_alloc(enum btree_id id) { switch (id) { @@ -1120,6 +476,35 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } +const char * const bch2_recovery_passes[] = { +#define x(_fn, _when) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + static void check_version_upgrade(struct bch_fs *c) { unsigned latest_compatible = bch2_version_compatible(c->sb.version); @@ -1172,7 +557,12 @@ static void check_version_upgrade(struct bch_fs *c) recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); if (recovery_passes) { - prt_str(&buf, "fsck required"); + if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) + prt_str(&buf, "fsck required"); + else { + prt_str(&buf, "running recovery passses: "); + prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); + } c->recovery_passes_explicit |= recovery_passes; c->opts.fix_errors = FSCK_FIX_yes; @@ -1188,42 +578,19 @@ static void check_version_upgrade(struct bch_fs *c) } } -static int bch2_check_allocations(struct bch_fs *c) -{ - return bch2_gc(c, true, c->opts.norecovery); -} - -static int bch2_set_may_go_rw(struct bch_fs *c) -{ - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - return 0; -} - -struct recovery_pass_fn { - int (*fn)(struct bch_fs *); - const char *name; - unsigned when; -}; - -static struct recovery_pass_fn recovery_passes[] = { -#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, - BCH_RECOVERY_PASSES() -#undef x -}; - u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) - if (recovery_passes[i].when & PASS_FSCK) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) ret |= BIT_ULL(i); return ret; } static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -1245,15 +612,18 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) c->curr_recovery_pass = pass; if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_passes + pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (!(p->when & PASS_SILENT)) - 
printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); + printk(KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); ret = p->fn(c); if (ret) return ret; if (!(p->when & PASS_SILENT)) printk(KERN_CONT " done\n"); + + c->recovery_passes_complete |= BIT_ULL(pass); } return 0; @@ -1263,7 +633,7 @@ static int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) continue; @@ -1283,17 +653,17 @@ int bch2_fs_recovery(struct bch_fs *c) bool write_sb = false; int ret = 0; - if (c->sb.clean) - clean = read_superblock_clean(c); - ret = PTR_ERR_OR_ZERO(clean); - if (ret) - goto err; + if (c->sb.clean) { + clean = bch2_read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; - if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - else + } else { bch_info(c, "recovering from unclean shutdown"); + } if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -1308,12 +678,6 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); - ret = -EINVAL; - goto err; - } - if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) check_version_upgrade(c); @@ -1373,12 +737,12 @@ int bch2_fs_recovery(struct bch_fs *c) } } - ret = journal_keys_sort(c); + ret = bch2_journal_keys_sort(c); if (ret) goto err; if (c->sb.clean && last_journal_entry) { - ret = verify_superblock_clean(c, &clean, + ret = bch2_verify_superblock_clean(c, &clean, last_journal_entry); if (ret) goto err; @@ -1513,7 +877,6 @@ use_clean: mutex_unlock(&c->sb_lock); if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; @@ -1581,7 +944,7 @@ int bch2_fs_initialize(struct bch_fs *c) } mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); + c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index f8e796c0..852d3056 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -2,55 +2,28 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -struct journal_iter { - struct list_head list; - enum btree_id btree_id; - unsigned level; - size_t idx; - struct journal_keys *keys; -}; +extern const char * const bch2_recovery_passes[]; /* - * Iterate over keys in the btree, with keys from the journal overlaid on top: + * For when we need to rewind recovery passes and run a pass we skipped: */ - -struct btree_and_journal_iter { - struct btree *b; - struct btree_node_iter node_iter; - struct bkey unpacked; - - struct journal_iter journal; - struct bpos pos; - bool at_end; -}; - -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, - unsigned, struct bpos, struct bpos, size_t *); -struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, - unsigned, 
struct bpos); - -int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_insert(struct bch_fs *, enum btree_id, - unsigned, struct bkey_i *); -int bch2_journal_key_delete(struct bch_fs *, enum btree_id, - unsigned, struct bpos); -void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, - unsigned, struct bpos); - -void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); - -void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, - struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); - -void bch2_journal_keys_free(struct journal_keys *); -void bch2_journal_entries_free(struct bch_fs *); +static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} u64 bch2_fsck_recovery_passes(void); diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c new file mode 100644 index 00000000..a3695e56 --- /dev/null +++ b/libbcachefs/sb-clean.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "replicas.h" +#include "sb-clean.h" +#include "super-io.h" + +/* + * BCH_SB_FIELD_clean: + * + * Btree roots, and a few other things, are recovered from the journal after an + * unclean shutdown - but after a clean shutdown, to avoid having to read the + * journal, we can store them in the superblock. 
+ * + * bch_sb_field_clean simply contains a list of journal entries, stored exactly + * as they would be in the journal: + */ + +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, + int write) +{ + struct jset_entry *entry; + int ret; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, NULL, entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; +} + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +int bch2_verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + prt_printf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + prt_printf(&buf2, "(none)"); + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(&k1->k)) || + l1 != l2, c, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, + l1, buf1.buf, + l2, buf2.buf); + } +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); + } + + ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry 
= *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) +{ + struct bch_dev *ca; + unsigned i, dev; + + percpu_down_read(&c->mark_lock); + + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct jset_entry_data_usage *u = + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(c->usage_base->replicas[i]); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); + } + + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } + } + + percpu_up_read(&c->mark_lock); + + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); + } +} + +static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -BCH_ERR_invalid_sb_clean; + } + + return 0; +} + +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + prt_printf(out, "flags: %x", 
le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, +}; + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +void bch2_fs_mark_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; + int ret; + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + + sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); + + entry = sb_clean->start; + bch2_journal_super_entries_add_common(c, &entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + /* + * this should be in the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } + + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} diff --git a/libbcachefs/sb-clean.h b/libbcachefs/sb-clean.h new file mode 100644 index 00000000..71caef28 --- /dev/null +++ b/libbcachefs/sb-clean.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_CLEAN_H +#define _BCACHEFS_SB_CLEAN_H + +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, + struct jset *); +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); +void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); + +extern const struct bch_sb_field_ops bch_sb_field_ops_clean; + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +#endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c new file mode 100644 index 00000000..16a2b338 --- /dev/null +++ b/libbcachefs/sb-members.c 
@@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "disk_groups.h" +#include "replicas.h" +#include "sb-members.h" +#include "super-io.h" + +/* Code for bch_sb_field_members: */ + +static int bch2_sb_members_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + unsigned i; + + if ((void *) (mi->members + sb->nr_devices) > + vstruct_end(&mi->field)) { + prt_printf(err, "too many devices for section size"); + return -BCH_ERR_invalid_sb_members; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + + if (!bch2_member_exists(m)) + continue; + + if (le64_to_cpu(m->nbuckets) > LONG_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m->nbuckets), LONG_MAX); + return -BCH_ERR_invalid_sb_members; + } + + if (le64_to_cpu(m->nbuckets) - + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m->bucket_size) < + le16_to_cpu(sb->block_size)) { + prt_printf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m->bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -BCH_ERR_invalid_sb_members; + } + } + + return 0; +} + +static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; + + if (!bch2_member_exists(m)) + continue; + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m->uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? 
bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_members = { + .validate = bch2_sb_members_validate, + .to_text = bch2_sb_members_to_text, +}; diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h new file mode 100644 index 00000000..34e1cf60 --- /dev/null +++ b/libbcachefs/sb-members.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_H +#define _BCACHEFS_SB_MEMBERS_H + +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_failed; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; +} + +static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} + +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } +} + +static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) +{ + return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; +} + +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + const struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ? 
find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter, NULL))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter, NULL)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) + +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + +/* XXX kill, move to struct bch_fs */ +static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +{ + struct bch_devs_mask devs; + struct bch_dev *ca; + unsigned i; + + memset(&devs, 0, sizeof(devs)); + for_each_online_member(ca, c, i) + __set_bit(ca->dev_idx, devs.d); + return devs; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_members; + +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index c9a5a7cb..86270288 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update_interior.h" -#include "buckets.h" #include "checksum.h" #include "counters.h" #include "disk_groups.h" @@ -10,12 +8,13 @@ #include "error.h" #include "io.h" #include "journal.h" -#include "journal_io.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" #include "recovery.h" #include "replicas.h" #include "quota.h" +#include "sb-clean.h" +#include "sb-members.h" #include "super-io.h" #include "super.h" #include "trace.h" @@ -1005,235 +1004,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) mutex_unlock(&c->sb_lock); } -/* BCH_SB_FIELD_members: */ - -static int bch2_sb_members_validate(struct bch_sb 
*sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - unsigned i; - - if ((void *) (mi->members + sb->nr_devices) > - vstruct_end(&mi->field)) { - prt_printf(err, "too many devices for section size"); - return -BCH_ERR_invalid_sb_members; - } - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - - if (!bch2_member_exists(m)) - continue; - - if (le64_to_cpu(m->nbuckets) > LONG_MAX) { - prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", - i, le64_to_cpu(m->nbuckets), LONG_MAX); - return -BCH_ERR_invalid_sb_members; - } - - if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { - prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", - i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) { - prt_printf(err, "device %u: bucket size %u smaller than block size %u", - i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); - return -BCH_ERR_invalid_sb_members; - } - - if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) { - prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", - i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -BCH_ERR_invalid_sb_members; - } - } - - return 0; -} - -static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); - unsigned i; - - for (i = 0; i < sb->nr_devices; i++) { - struct bch_member *m = mi->members + i; - unsigned data_have = bch2_sb_dev_has_data(sb, i); - u64 bucket_size = le16_to_cpu(m->bucket_size); - u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; - - if (!bch2_member_exists(m)) - continue; - - prt_printf(out, "Device:"); - prt_tab(out); - prt_printf(out, "%u", i); - prt_newline(out); - - printbuf_indent_add(out, 2); - - prt_printf(out, "UUID:"); - prt_tab(out); - pr_uuid(out, m->uuid.b); - prt_newline(out); - - prt_printf(out, "Size:"); - prt_tab(out); - prt_units_u64(out, device_size << 9); - prt_newline(out); - - prt_printf(out, "Bucket size:"); - prt_tab(out); - prt_units_u64(out, bucket_size << 9); - prt_newline(out); - - prt_printf(out, "First bucket:"); - prt_tab(out); - prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); - prt_newline(out); - - prt_printf(out, "Buckets:"); - prt_tab(out); - prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); - prt_newline(out); - - prt_printf(out, "Last mount:"); - prt_tab(out); - if (m->last_mount) - pr_time(out, le64_to_cpu(m->last_mount)); - else - prt_printf(out, "(never)"); - prt_newline(out); - - prt_printf(out, "State:"); - prt_tab(out); - prt_printf(out, "%s", - BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR - ? 
bch2_member_states[BCH_MEMBER_STATE(m)] - : "unknown"); - prt_newline(out); - - prt_printf(out, "Label:"); - prt_tab(out); - if (BCH_MEMBER_GROUP(m)) { - unsigned idx = BCH_MEMBER_GROUP(m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { - prt_printf(out, "(none)"); - } - prt_newline(out); - - prt_printf(out, "Data allowed:"); - prt_tab(out); - if (BCH_MEMBER_DATA_ALLOWED(m)) - prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Has data:"); - prt_tab(out); - if (data_have) - prt_bitflags(out, bch2_data_types, data_have); - else - prt_printf(out, "(none)"); - prt_newline(out); - - prt_printf(out, "Discard:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); - prt_newline(out); - - prt_printf(out, "Freespace initialized:"); - prt_tab(out); - prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); - prt_newline(out); - - printbuf_indent_sub(out, 2); - } -} - -static const struct bch_sb_field_ops bch_sb_field_ops_members = { - .validate = bch2_sb_members_validate, - .to_text = bch2_sb_members_to_text, -}; - -/* BCH_SB_FIELD_crypt: */ - -static int bch2_sb_crypt_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - if (BCH_CRYPT_KDF_TYPE(crypt)) { - prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -BCH_ERR_invalid_sb_crypt; - } - - return 0; -} - -static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - - prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); - prt_newline(out); - prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); - prt_newline(out); - prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); - prt_newline(out); - prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); - prt_newline(out); -} - -static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { - .validate = bch2_sb_crypt_validate, - .to_text = bch2_sb_crypt_to_text, -}; - -/* BCH_SB_FIELD_clean: */ - -int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) -{ - struct jset_entry *entry; - int ret; - - for (entry = clean->start; - entry < (struct jset_entry *) vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - ret = bch2_journal_entry_validate(c, NULL, entry, - le16_to_cpu(c->disk_sb.sb->version), - BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); - if (ret) - return ret; - } - - return 0; -} - /* Downgrade if superblock is at a higher version than currently supported: */ void bch2_sb_maybe_downgrade(struct bch_fs *c) { @@ -1260,232 +1030,6 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); } -int bch2_fs_mark_dirty(struct bch_fs *c) -{ - int ret; - - /* - * Unconditionally write superblock, to verify it hasn't changed before - * we go rw: - */ - - mutex_lock(&c->sb_lock); - SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - - bch2_sb_maybe_downgrade(c); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - - ret = 
bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return ret; -} - -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - -void bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry **end, - u64 journal_seq) -{ - struct bch_dev *ca; - unsigned i, dev; - - percpu_down_read(&c->mark_lock); - - if (!journal_seq) { - for (i = 0; i < ARRAY_SIZE(c->usage); i++) - bch2_fs_usage_acc_to_base(c, i); - } else { - bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_inodes; - u->v = cpu_to_le64(c->usage_base->nr_inodes); - } - - { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_key_version; - u->v = cpu_to_le64(atomic64_read(&c->key_version)); - } - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - struct jset_entry_usage *u = - container_of(jset_entry_init(end, sizeof(*u)), - struct jset_entry_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = BCH_FS_USAGE_reserved; - u->entry.level = i; - u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - } - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - struct jset_entry_data_usage *u = - container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), - struct jset_entry_data_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_data_usage; - u->v = cpu_to_le64(c->usage_base->replicas[i]); - unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), - "embedded variable length struct"); - } - - for_each_member_device(ca, c, dev) { - unsigned b = sizeof(struct jset_entry_dev_usage) + - sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; - struct jset_entry_dev_usage *u = - container_of(jset_entry_init(end, b), - struct jset_entry_dev_usage, entry); - - u->entry.type = BCH_JSET_ENTRY_dev_usage; - u->dev = cpu_to_le32(dev); - u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); - - for (i = 0; i < BCH_DATA_NR; i++) { - u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); - u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); - u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); - } - } - - percpu_up_read(&c->mark_lock); - - for (i = 0; i < 2; i++) { - struct jset_entry_clock *clock = - container_of(jset_entry_init(end, sizeof(*clock)), - struct jset_entry_clock, entry); - - clock->entry.type = BCH_JSET_ENTRY_clock; - clock->rw = i; - clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); - } -} - -void bch2_fs_mark_clean(struct bch_fs *c) -{ - struct bch_sb_field_clean *sb_clean; - struct jset_entry *entry; - unsigned u64s; - int ret; - - mutex_lock(&c->sb_lock); - if (BCH_SB_CLEAN(c->disk_sb.sb)) - goto out; - - SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); - c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); - c->disk_sb.sb->features[0] 
&= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); - c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); - - u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; - - sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); - if (!sb_clean) { - bch_err(c, "error resizing superblock while setting filesystem clean"); - goto out; - } - - sb_clean->flags = 0; - sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); - - /* Trying to catch outstanding bug: */ - BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); - - entry = sb_clean->start; - bch2_journal_super_entries_add_common(c, &entry, 0); - entry = bch2_btree_roots_to_journal_entries(c, entry, entry); - BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); - - memset(entry, 0, - vstruct_end(&sb_clean->field) - (void *) entry); - - /* - * this should be in the write path, and we should be validating every - * superblock section: - */ - ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); - if (ret) { - bch_err(c, "error writing marking filesystem clean: validate error"); - goto out; - } - - bch2_write_super(c); -out: - mutex_unlock(&c->sb_lock); -} - -static int bch2_sb_clean_validate(struct bch_sb *sb, - struct bch_sb_field *f, - struct printbuf *err) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - - if (vstruct_bytes(&clean->field) < sizeof(*clean)) { - prt_printf(err, "wrong size (got %zu should be %zu)", - vstruct_bytes(&clean->field), sizeof(*clean)); - return -BCH_ERR_invalid_sb_clean; - } - - return 0; -} - -static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, - struct bch_sb_field *f) -{ - struct bch_sb_field_clean *clean = field_to_type(f, clean); - struct jset_entry *entry; - - prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); - prt_newline(out); - prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); - prt_newline(out); - - for (entry = clean->start; - entry != vstruct_end(&clean->field); - entry = vstruct_next(entry)) { - if (entry->type == BCH_JSET_ENTRY_btree_keys && - !entry->u64s) - continue; - - bch2_journal_entry_to_text(out, NULL, entry); - prt_newline(out); - } -} - -static const struct bch_sb_field_ops bch_sb_field_ops_clean = { - .validate = bch2_sb_clean_validate, - .to_text = bch2_sb_clean_to_text, -}; - static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #define x(f, nr) \ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 904adea6..5181dcb3 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -121,19 +121,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) }; } -/* BCH_SB_FIELD_clean: */ - -void bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry **, u64); - -int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); - void bch2_sb_maybe_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned); -int bch2_fs_mark_dirty(struct bch_fs *); -void bch2_fs_mark_clean(struct bch_fs *); - void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index eee56969..82e992b3 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -13,6 +13,7 @@ #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" +#include 
"btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -30,6 +31,8 @@ #include "error.h" #include "fs.h" #include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" #include "fsck.h" #include "inode.h" #include "io.h" @@ -44,6 +47,7 @@ #include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "sb-clean.h" #include "subvolume.h" #include "super.h" #include "super-io.h" @@ -469,6 +473,8 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); + bch2_fs_fs_io_direct_exit(c); + bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); @@ -844,7 +850,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c); + bch2_fs_fsio_init(c) ?: + bch2_fs_fs_io_buffered_init(c); + bch2_fs_fs_io_direct_init(c); if (ret) goto err; @@ -2000,6 +2008,7 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM +__maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 36bcb9ec..bf762df1 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -8,220 +8,6 @@ #include -static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -{ - return div_u64(s, ca->mi.bucket_size); -} - -static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -{ - return ((sector_t) b) * ca->mi.bucket_size; -} - -static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -{ - u32 remainder; - - div_u64_rem(s, ca->mi.bucket_size, &remainder); - return remainder; -} - -static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, - u32 *offset) -{ - return div_u64_rem(s, ca->mi.bucket_size, offset); -} - -static inline bool bch2_dev_is_online(struct bch_dev *ca) -{ - return !percpu_ref_is_zero(&ca->io_ref); -} - -static inline bool bch2_dev_is_readable(struct bch_dev *ca) -{ - return bch2_dev_is_online(ca) && - ca->mi.state != BCH_MEMBER_STATE_failed; -} - -static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -{ - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - - if (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) - return true; - - percpu_ref_put(&ca->io_ref); - return false; -} - -static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -{ - return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -} - -static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, - unsigned dev) -{ - unsigned i; - - for (i = 0; i < devs.nr; i++) - if (devs.devs[i] == dev) - return true; - - return false; -} - -static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, - unsigned dev) -{ - unsigned i; - - for (i = 0; i < devs->nr; i++) - if (devs->devs[i] == dev) { - array_remove_item(devs->devs, devs->nr, i); - return; - } -} - -static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, - unsigned dev) -{ - if (!bch2_dev_list_has_dev(*devs, dev)) { - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; - } -} - -static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -{ - return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -} - -static inline struct bch_dev 
*__bch2_next_dev(struct bch_fs *c, unsigned *iter, - const struct bch_devs_mask *mask) -{ - struct bch_dev *ca = NULL; - - while ((*iter = mask - ? find_next_bit(mask->d, c->sb.nr_devices, *iter) - : *iter) < c->sb.nr_devices && - !(ca = rcu_dereference_check(c->devs[*iter], - lockdep_is_held(&c->state_lock)))) - (*iter)++; - - return ca; -} - -#define for_each_member_device_rcu(ca, c, iter, mask) \ - for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) - -static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -{ - struct bch_dev *ca; - - rcu_read_lock(); - if ((ca = __bch2_next_dev(c, iter, NULL))) - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - - return ca; -} - -/* - * If you break early, you must drop your ref on the current device - */ -#define for_each_member_device(ca, c, iter) \ - for ((iter) = 0; \ - (ca = bch2_get_next_dev(c, &(iter))); \ - percpu_ref_put(&ca->ref), (iter)++) - -static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, - unsigned *iter, - int state_mask) -{ - struct bch_dev *ca; - - rcu_read_lock(); - while ((ca = __bch2_next_dev(c, iter, NULL)) && - (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) - (*iter)++; - rcu_read_unlock(); - - return ca; -} - -#define __for_each_online_member(ca, c, iter, state_mask) \ - for ((iter) = 0; \ - (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ - percpu_ref_put(&ca->io_ref), (iter)++) - -#define for_each_online_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, ~0) - -#define for_each_rw_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) - -#define for_each_readable_member(ca, c, iter) \ - __for_each_online_member(ca, c, iter, \ - (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) - -/* - * If a key exists that references a device, the device won't be going away and - * we can omit rcu_read_lock(): - */ -static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -{ - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); - - return rcu_dereference_check(c->devs[idx], 1); -} - -static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -{ - EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); - - return rcu_dereference_protected(c->devs[idx], - lockdep_is_held(&c->sb_lock) || - lockdep_is_held(&c->state_lock)); -} - -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - struct bch_dev *ca; - unsigned i; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(ca, c, i) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} - struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); -- cgit 1.2.3-korg
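The recovery changes above keep the pass names and the pass functions in two tables generated from the same BCH_RECOVERY_PASSES() x-macro, so bch2_recovery_passes[pass] and recovery_pass_fns[pass] stay aligned by index and bch2_run_recovery_pass() can log the name of the pass it is about to run. The standalone sketch below shows that x-macro pattern in miniature; PASSES(), the demo pass functions and the PASS_* values are illustrative stand-ins (bcachefs additionally prefixes the generated function names with bch2_), not bcachefs identifiers.

#include <stdio.h>

/* Pass flags: illustrative stand-ins for the PASS_* flags in recovery.h. */
enum { PASS_ALWAYS = 0, PASS_FSCK = 1 << 0, PASS_SILENT = 1 << 1 };

/* One x-macro list drives every table that must stay index-aligned. */
#define PASSES()				\
	x(check_foo, PASS_FSCK)			\
	x(check_bar, PASS_FSCK|PASS_SILENT)	\
	x(go_rw,     PASS_ALWAYS)

static int check_foo(void) { return 0; }
static int check_bar(void) { return 0; }
static int go_rw(void)     { return 0; }

/* Pass names, generated from the same list (cf. bch2_recovery_passes[]). */
static const char * const pass_names[] = {
#define x(_fn, _when) #_fn,
	PASSES()
#undef x
	NULL
};

/* Function plus flags, also from the same list (cf. recovery_pass_fns[]). */
struct pass_fn {
	int		(*fn)(void);
	unsigned	when;
};

static const struct pass_fn pass_fns[] = {
#define x(_fn, _when) { .fn = _fn, .when = _when },
	PASSES()
#undef x
};

int main(void)
{
	/* Run each pass in order, printing its name unless it is silent. */
	for (unsigned i = 0; i < sizeof(pass_fns) / sizeof(pass_fns[0]); i++) {
		if (!(pass_fns[i].when & PASS_SILENT))
			printf("%s...\n", pass_names[i]);
		if (pass_fns[i].fn())
			return 1;
	}
	return 0;
}

Because both tables come from one list, adding a pass is a single x() line; the name table, the function table and the enum of pass indices can never drift apart.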
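jset_entry_init(), moved into sb-clean.c above, rounds a byte size up to 64-bit words, zeroes that much space, and stores the word count minus one, because a jset entry's u64s field counts only the words after the shared header. A minimal sketch of that accounting follows, assuming an 8-byte header as in struct jset_entry; demo_entry and entry_init() are names made up for illustration, not bcachefs code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for struct jset_entry: 8-byte header, then payload u64s. */
struct demo_entry {
	uint16_t	u64s;		/* payload words, excluding the header word */
	uint8_t		pad[6];
	uint64_t	data[];
};

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Carve the next entry out of a buffer and advance *end past it. */
static struct demo_entry *entry_init(void **end, size_t size)
{
	struct demo_entry *entry = *end;
	unsigned u64s = DIV_ROUND_UP(size, sizeof(uint64_t));

	memset(entry, 0, u64s * sizeof(uint64_t));
	entry->u64s = u64s - 1;			/* header word not counted */

	*end = (uint64_t *) entry + u64s;	/* what vstruct_next() computes */
	return entry;
}

int main(void)
{
	uint64_t buf[32];
	void *end = buf;

	/* A 16-byte entry occupies 2 u64s total; its u64s field records 1. */
	struct demo_entry *e = entry_init(&end, 16);
	printf("u64s = %u, next entry starts at byte %td\n",
	       (unsigned) e->u64s, (char *) end - (char *) buf);
	return 0;
}

The same arithmetic is what lets bch2_journal_super_entries_add_common() append usage, replicas and clock entries back to back and have the caller find the end of the list with vstruct_next().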