diff options
author | Coly Li <colyli@suse.de> | 2019-05-14 23:14:27 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2019-05-14 23:14:27 +0800 |
commit | 03d4e899a667696d44b08d3564e1fa57d181fab2 (patch) | |
tree | c0d3ae717b5517bc31db4936df5b605266e665db | |
parent | d78c68017a888eebb745cbf405d1bcb3d655372a (diff) | |
download | bcache-patches-03d4e899a667696d44b08d3564e1fa57d181fab2.tar.gz |
for-test:
- move v2 journal-deadlock fixes into journal-deadlock/v2/
- add more testing patches
42 files changed, 1344 insertions, 0 deletions
diff --git a/for-next/0000-cover-letter.patch b/for-current/0000-cover-letter.patch index 4c50e79..4c50e79 100644 --- a/for-next/0000-cover-letter.patch +++ b/for-current/0000-cover-letter.patch diff --git a/for-next/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch b/for-current/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch index e4b4e30..e4b4e30 100644 --- a/for-next/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch +++ b/for-current/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch diff --git a/for-next/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch b/for-current/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch index 9ade407..9ade407 100644 --- a/for-next/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch +++ b/for-current/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch diff --git a/for-next/0003-bcache-avoid-clang-Wunintialized-warning.patch b/for-current/0003-bcache-avoid-clang-Wunintialized-warning.patch index 0fc146b..0fc146b 100644 --- a/for-next/0003-bcache-avoid-clang-Wunintialized-warning.patch +++ b/for-current/0003-bcache-avoid-clang-Wunintialized-warning.patch diff --git a/for-next/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch b/for-current/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch index 2525d6d..2525d6d 100644 --- a/for-next/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch +++ b/for-current/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch diff --git a/for-next/0005-bcache-Clean-up-bch_get_congested.patch b/for-current/0005-bcache-Clean-up-bch_get_congested.patch index 37915b0..37915b0 100644 --- a/for-next/0005-bcache-Clean-up-bch_get_congested.patch +++ b/for-current/0005-bcache-Clean-up-bch_get_congested.patch diff --git a/for-next/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch b/for-current/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch index 1e0a60a..1e0a60a 100644 --- 
a/for-next/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch +++ b/for-current/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch diff --git a/for-next/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch b/for-current/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch index 8ec6b9f..8ec6b9f 100644 --- a/for-next/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch +++ b/for-current/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch diff --git a/for-next/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch b/for-current/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch index c6e2834..c6e2834 100644 --- a/for-next/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch +++ b/for-current/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch diff --git a/for-next/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch b/for-current/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch index aa0bd63..aa0bd63 100644 --- a/for-next/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch +++ b/for-current/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch diff --git a/for-next/0010-bcache-add-comments-for-kobj-release-callback-routin.patch b/for-current/0010-bcache-add-comments-for-kobj-release-callback-routin.patch index 3376600..3376600 100644 --- a/for-next/0010-bcache-add-comments-for-kobj-release-callback-routin.patch +++ b/for-current/0010-bcache-add-comments-for-kobj-release-callback-routin.patch diff --git a/for-next/0011-bcache-return-error-immediately-in-bch_journal_repla.patch b/for-current/0011-bcache-return-error-immediately-in-bch_journal_repla.patch index a0a26d7..a0a26d7 100644 --- a/for-next/0011-bcache-return-error-immediately-in-bch_journal_repla.patch +++ b/for-current/0011-bcache-return-error-immediately-in-bch_journal_repla.patch diff --git 
a/for-next/0012-bcache-add-error-check-for-calling-register_bdev.patch b/for-current/0012-bcache-add-error-check-for-calling-register_bdev.patch index 2ecf470..2ecf470 100644 --- a/for-next/0012-bcache-add-error-check-for-calling-register_bdev.patch +++ b/for-current/0012-bcache-add-error-check-for-calling-register_bdev.patch diff --git a/for-next/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch b/for-current/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch index d668b43..d668b43 100644 --- a/for-next/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch +++ b/for-current/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch diff --git a/for-next/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch b/for-current/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch index 547e03a..547e03a 100644 --- a/for-next/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch +++ b/for-current/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch diff --git a/for-next/0015-bcache-improve-bcache_reboot.patch b/for-current/0015-bcache-improve-bcache_reboot.patch index 8b570d1..8b570d1 100644 --- a/for-next/0015-bcache-improve-bcache_reboot.patch +++ b/for-current/0015-bcache-improve-bcache_reboot.patch diff --git a/for-next/0016-bcache-fix-failure-in-journal-relplay.patch b/for-current/0016-bcache-fix-failure-in-journal-relplay.patch index ad82e9e..ad82e9e 100644 --- a/for-next/0016-bcache-fix-failure-in-journal-relplay.patch +++ b/for-current/0016-bcache-fix-failure-in-journal-relplay.patch diff --git a/for-next/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch b/for-current/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch index 2d7b79c..2d7b79c 100644 --- a/for-next/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch +++ b/for-current/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch diff --git 
a/for-next/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch b/for-current/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch index 9f113ec..9f113ec 100644 --- a/for-next/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch +++ b/for-current/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch diff --git a/for-test/0001-bcache-check-return-value-of-prio_read.patch b/for-test/0001-bcache-check-return-value-of-prio_read.patch new file mode 100644 index 0000000..7d4c016 --- /dev/null +++ b/for-test/0001-bcache-check-return-value-of-prio_read.patch @@ -0,0 +1,76 @@ +From d6bda91f5c00320df4fc1dbdf5b95d5d1c22d606 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 14 Feb 2019 15:44:57 +0800 +Subject: [PATCH] bcache: check return value of prio_read() + +Then we can print out error message in run_cache_set() properly. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 21 ++++++++++++++++----- + 1 file changed, 16 insertions(+), 5 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 4dee119c3664..1147ed26febf 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -591,12 +591,13 @@ void bch_prio_write(struct cache *ca) + } + } + +-static void prio_read(struct cache *ca, uint64_t bucket) ++static int prio_read(struct cache *ca, uint64_t bucket) + { + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket *b; + unsigned int bucket_nr = 0; ++ int ret = -EIO; + + for (b = ca->buckets; + b < ca->buckets + ca->sb.nbuckets; +@@ -609,11 +610,15 @@ static void prio_read(struct cache *ca, uint64_t bucket) + prio_io(ca, bucket, REQ_OP_READ, 0); + + if (p->csum != +- bch_crc64(&p->magic, bucket_bytes(ca) - 8)) ++ bch_crc64(&p->magic, bucket_bytes(ca) - 8)) { + pr_warn("bad csum reading priorities"); ++ goto out; ++ } + +- if (p->magic != pset_magic(&ca->sb)) ++ if 
(p->magic != pset_magic(&ca->sb)) { + pr_warn("bad magic reading priorities"); ++ goto out; ++ } + + bucket = p->next_bucket; + d = p->data; +@@ -622,6 +627,10 @@ static void prio_read(struct cache *ca, uint64_t bucket) + b->prio = le16_to_cpu(d->prio); + b->gen = b->last_gc = d->gen; + } ++ ++ ret = 0; ++out: ++ return ret; + } + + /* Bcache device */ +@@ -1807,8 +1816,10 @@ static void run_cache_set(struct cache_set *c) + j = &list_entry(journal.prev, struct journal_replay, list)->j; + + err = "IO error reading priorities"; +- for_each_cache(ca, c, i) +- prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]); ++ for_each_cache(ca, c, i) { ++ if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev])) ++ goto err; ++ } + + /* + * If prio_read() fails it'll call cache_set_error and we'll +-- +2.16.4 + diff --git a/for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch b/for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch new file mode 100644 index 0000000..1338418 --- /dev/null +++ b/for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch @@ -0,0 +1,55 @@ +From 31dc685d78b6f77ddd3d4ffa97478431a6602ed9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 22:48:09 +0800 +Subject: [PATCH 1/5] bcache: ignore read-ahead request failure on backing + device + +When md raid device (e.g. raid456) is used as backing device, read-ahead +requests on a degrading and recovering md raid device might be failured +immediately by md raid code, but indeed this md raid array can still be +read or write for normal I/O requests. Therefore such failed read-ahead +request are not real hardware failure. Further more, after degrading and +recovering accomplished, read-ahead requests will be handled by md raid +array again. + +For such condition, I/O failures of read-ahead requests don't indicate +real health status (because normal I/O still be served), they should not +be counted into I/O error counter dc->io_errors. 
+ +Since there is no simple way to detect whether the backing divice is a +md raid device, this patch simply ignores I/O failures for read-ahead +bios on backing device, to avoid bogus backing device failure on a +degrading md raid array. + +Suggested-by: Thorsten Knabe <linux@thorsten-knabe.de> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/io.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index c25097968319..4d93f07f63e5 100644 +--- a/drivers/md/bcache/io.c ++++ b/drivers/md/bcache/io.c +@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) + + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + ++ /* ++ * Read-ahead requests on a degrading and recovering md raid ++ * (e.g. raid6) device might be failured immediately by md ++ * raid code, which is not a real hardware media failure. So ++ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. ++ */ ++ if (bio->bi_opf & REQ_RAHEAD) { ++ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", ++ dc->backing_dev_name); ++ return; ++ } ++ + errors = atomic_add_return(1, &dc->io_errors); + if (errors < dc->error_limit) + pr_err("%s: IO error on backing device, unrecoverable", +-- +2.16.4 + diff --git a/for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch b/for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch new file mode 100644 index 0000000..21e2ad5 --- /dev/null +++ b/for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch @@ -0,0 +1,37 @@ +From 1ccada2ebb2f37fbe2b0a3705a3166e4f3f8d2fb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:42:39 +0800 +Subject: [PATCH 2/5] bcache: add io error counting in write_bdev_super_endio() + +When backing device super block is written by bch_write_bdev_super(), +the bio complete callback write_bdev_super_endio() simply ignores I/O 
+status. Indeed such write request also contribute to backing device +health status if the request failed. + +This patch checkes bio->bi_status in write_bdev_super_endio(), if there +is error, bch_count_backing_io_errors() will be called to count an I/O +error to dc->io_errors. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 1b63ac876169..2858682cce14 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + static void write_bdev_super_endio(struct bio *bio) + { + struct cached_dev *dc = bio->bi_private; +- /* XXX: error checking */ ++ ++ if (bio->bi_status) ++ bch_count_backing_io_errors(dc, bio); + + closure_put(&dc->sb_write); + } +-- +2.16.4 + diff --git a/for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch b/for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch new file mode 100644 index 0000000..9243605 --- /dev/null +++ b/for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch @@ -0,0 +1,31 @@ +From 5e92305f8838785b2c42ed2cb8c5f2bc03103e94 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:47:38 +0800 +Subject: [PATCH 3/5] bcache: remove "XXX:" comment line from run_cache_set() + +In previous bcache patches for Linux v5.2, the failure code path of +run_cache_set() is tested and fixed. 
So now the following comment +line can be removed from run_cache_set(), + /* XXX: test this, it's broken */ + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 2858682cce14..9d9f852852c6 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1959,7 +1959,7 @@ static int run_cache_set(struct cache_set *c) + } + + closure_sync(&cl); +- /* XXX: test this, it's broken */ ++ + bch_cache_set_error(c, "%s", err); + + return -EIO; +-- +2.16.4 + diff --git a/for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch b/for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch new file mode 100644 index 0000000..d8f996a --- /dev/null +++ b/for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch @@ -0,0 +1,56 @@ +From 77980a54c7e90525e8cada5b75bc44daa214d9e5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:23:35 +0800 +Subject: [PATCH 4/5] bcache: remove unnecessary prefetch() in + bset_search_tree() + +In function bset_search_tree(), when p >= t->size, t->tree[0] will be +prefetched by the following code piece, + 974 unsigned int p = n << 4; + 975 + 976 p &= ((int) (p - t->size)) >> 31; + 977 + 978 prefetch(&t->tree[p]); + +The purpose of the above code is to avoid a branch instruction, but +when p >= t->size, prefetch(&t->tree[0]) has no positive performance +contribution at all. This patch avoids the unncessary prefetch by only +calling prefetch() when p < t->size. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 16 ++-------------- + 1 file changed, 2 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index 8f07fa6e1739..aa2e4ab0fab9 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -960,22 +960,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + unsigned int inorder, j, n = 1; + + do { +- /* +- * A bit trick here. +- * If p < t->size, (int)(p - t->size) is a minus value and +- * the most significant bit is set, right shifting 31 bits +- * gets 1. If p >= t->size, the most significant bit is +- * not set, right shifting 31 bits gets 0. +- * So the following 2 lines equals to +- * if (p >= t->size) +- * p = 0; +- * but a branch instruction is avoided. +- */ + unsigned int p = n << 4; + +- p &= ((int) (p - t->size)) >> 31; +- +- prefetch(&t->tree[p]); ++ if (p < t->size) ++ prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; +-- +2.16.4 + diff --git a/for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch b/for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch new file mode 100644 index 0000000..1ed7fae --- /dev/null +++ b/for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch @@ -0,0 +1,58 @@ +From 5e31e419f54eb8db7f4e95bf9328523e801c1dfb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:51:40 +0800 +Subject: [PATCH 5/5] bcache: make bset_search_tree() be more understandable + +The purpose of following code in bset_search_tree() is to avoid a branch +instruction, + 994 if (likely(f->exponent != 127)) + 995 n = j * 2 + (((unsigned int) + 996 (f->mantissa - + 997 bfloat_mantissa(search, f))) >> 31); + 998 else + 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) +1000 ? j * 2 +1001 : j * 2 + 1; + +This piece of code is not very clear to understand, even when I tried to +add code comment for it, I made mistake. 
This patch removes the implict +bit operation and uses explicit branch to calculate next location in +binary tree search. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 17 +++-------------- + 1 file changed, 3 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index aa2e4ab0fab9..f752cc791f50 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -968,21 +968,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + j = n; + f = &t->tree[j]; + +- /* +- * Similar bit trick, use subtract operation to avoid a branch +- * instruction. +- * +- * n = (f->mantissa > bfloat_mantissa()) +- * ? j * 2 +- * : j * 2 + 1; +- * +- * We need to subtract 1 from f->mantissa for the sign bit trick +- * to work - that's done in make_bfloat() +- */ + if (likely(f->exponent != 127)) +- n = j * 2 + (((unsigned int) +- (f->mantissa - +- bfloat_mantissa(search, f))) >> 31); ++ n = (f->mantissa >= bfloat_mantissa(search, f)) ++ ? j * 2 ++ : j * 2 + 1; + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ? 
j * 2 +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2-0000-cover-letter.patch b/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch index 19d3c21..19d3c21 100644 --- a/for-test/jouranl-deadlock/v2-0000-cover-letter.patch +++ b/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch diff --git a/for-test/jouranl-deadlock/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch b/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch index 6f5e2da..6f5e2da 100644 --- a/for-test/jouranl-deadlock/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch +++ b/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch diff --git a/for-test/jouranl-deadlock/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch b/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch index fcb490d..fcb490d 100644 --- a/for-test/jouranl-deadlock/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch +++ b/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch diff --git a/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch new file mode 100644 index 0000000..cfe5323 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch @@ -0,0 +1,161 @@ +From e3c194808a99446e9bf69ac0707c7d3f473be518 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 27 Feb 2019 20:32:22 +0800 +Subject: [RFC PATCH v2 03/16] bcache: reload jouranl key information during + journal replay + +When bcache journal initiates during running cache set, cache set +journal.blocks_free is initiated as 0. Then during journal replay if +journal_meta() is called and an empty jset is written to cache device, +journal_reclaim() is called. 
If there is available journal bucket to +reclaim, c->journal.blocks_free is set to numbers of blocks of a journal +bucket, which is c->sb.bucket_size >> c->block_bits. + +Most of time the above process works correctly, except the condition +when journal space is almost full. "Almost full" means there is no free +journal bucket, but there are still free blocks in last available +bucket indexed by ja->cur_idx. + +If system crashes or reboots when journal space is almost full, problem +comes. During cache set reload after the reboot, c->journal.blocks_free +is initialized as 0, when journal replay process writes bcache journal, +journal_reclaim() will be called to reclaim available journal bucket and +set c->journal.blocks_free to c->sb.bucket_size >> c->block_bits. But +there is no fully free bucket to reclaim in journal_reclaim(), so value +of c->journal.blocks_free will keep 0. If the first journal entry +processed by journal_replay() causes btree split and requires writing +journal space by journal_meta(), journal_meta() has to go into an +infinite loop to reclaim journal bucket, and blocks the whole cache set +to run. + +Such buggy situation can be solved if we do following things before +journal replay starts, +- Recover previous value of c->journal.blocks_free in last run time, + and set it to current c->journal.blocks_free as initial value. +- Recover previous value of ja->cur_idx in last run time, and set it to + KEY_PTR of current c->journal.key as initial value. + +After c->journal.blocks_free and c->journal.key are recovered, in +condition when journal space is almost full and cache set is reloaded, +meta journal entry from journal replay can be written into free blocks of +the last available journal bucket, then old journal entries can be +replayed and reclaimed for further journaling request. + +This patch adds bch_journal_key_reload() to recover journal blocks_free +and key ptr value for above purpose. 
bch_journal_key_reload() is called +in bch_journal_read() before replying journal by bch_journal_replay(). + +Cc: stable@vger.kernel.org +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 87 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 5180bed911ef..a6deb16c15c8 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -143,6 +143,89 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int bch_journal_key_reload(struct cache_set *c) ++{ ++ struct cache *ca; ++ unsigned int iter, n = 0; ++ struct bkey *k = &c->journal.key; ++ int ret = 0; ++ ++ for_each_cache(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ struct bio *bio = &ja->bio; ++ struct jset *j, *data = c->journal.w[0].data; ++ struct closure cl; ++ unsigned int len, left; ++ unsigned int offset = 0, used_blocks = 0; ++ sector_t bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]); ++ ++ closure_init_stack(&cl); ++ ++ while (offset < ca->sb.bucket_size) { ++reread: left = ca->sb.bucket_size - offset; ++ len = min_t(unsigned int, ++ left, PAGE_SECTORS << JSET_BITS); ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = &cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(c, bio, &cl); ++ closure_sync(&cl); ++ ++ j = data; ++ while (len) { ++ size_t blocks, bytes = set_bytes(j); ++ ++ if (j->magic != jset_magic(&ca->sb)) ++ goto out; ++ ++ if (bytes > left << 9 || ++ bytes > PAGE_SIZE << JSET_BITS) { ++ pr_err("jset may be correpted: too big"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bytes > len << 9) ++ goto reread; ++ ++ if (j->csum != csum_set(j)) { ++ pr_err("jset may be corrupted: bad csum"); ++ ret = -EIO; ++ goto 
err; ++ } ++ ++ blocks = set_blocks(j, block_bytes(c)); ++ used_blocks += blocks; ++ ++ offset += blocks * ca->sb.block_size; ++ len -= blocks * ca->sb.block_size; ++ j = ((void *) j) + blocks * block_bytes(ca); ++ } ++ } ++out: ++ c->journal.blocks_free = ++ (c->sb.bucket_size >> c->block_bits) - ++ used_blocks; ++ ++ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev); ++ } ++ ++ BUG_ON(n == 0); ++ bkey_init(k); ++ SET_KEY_PTRS(k, n); ++ ++err: ++ return ret; ++} ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -268,6 +351,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + struct journal_replay, + list)->j.seq; + ++ /* Initial value of c->journal.blocks_free should be 0 */ ++ BUG_ON(c->journal.blocks_free != 0); ++ ret = bch_journal_key_reload(c); ++ + return ret; + #undef read_bucket + } +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch new file mode 100644 index 0000000..39b9873 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch @@ -0,0 +1,276 @@ +From 97898c33b4126381cb08f8560623325cc23291e5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 27 Feb 2019 20:35:02 +0800 +Subject: [RFC PATCH v2 04/16] bcache: fix journal deadlock during jouranl + replay + +A deadlock of bcache jouranling may happen during journal replay. Such +deadlock happens when, +- Journal space is totally full (no any free blocks) and system crashes + or reboots. +- After reboot, the first journal entry handled by jouranl replay causes + btree split and jouranl_meta() is called to write an empty jset to + journal space. 
+- There is no journal space to write and journal_reclaim() fails to get + any available bucket because this is the first replayed journal entry + to be blocked. +Then the whole cache set is blocked from running. + +This patch is an effort to fix such journal replay deadlock in a simpler +way, +- Add a bool variable 'in_replay' in struct journal, set it to true when + journal replay starts, and set it to false when journal replay + completed. in_replay is initialized to be false. +- Reserve 6 sectors in journal bucket, do not use them in normal bcache + runtime. These sectors are only permitted to use during journal + replay (when c->journal.in_replay is true) + +Then in normal bcache runtime, journal space won't be totally full and +there are always 6 sectors reserved for journal replay time. After +system reboots, if bch_btree_insert() in bch_journal_replay() causes +btree split and bch_journal_meta() gets called to require 1 sector +from journal buckets to write an empty jset, there are enough reserved +space to serve. + +The reason to reserve 6 sectors is, we should choose a number that won't +fit into a bucket size. If the reserved space happens to be a whole +bucket, more logic has to be added in journal_replay() to handle +journal.blocks_free with reserved spaces in journal replay time. This is +why 6 sectors is chosen, it is 3KB and won't be any proper block size +or bucket size. + +The bcache btree node size is quite large, so btree node split won't be +a frequent event. And when btree node split happens, new added key will +be inserted directly into upper level or neighbor nodes and won't go into +journal again, only bch_journal_meta() is called to write jset metadata +which occupies 1 block in journal space. If blocksize is set to 4K size, +reserve 6 sectors indeed is 2 blocks, so there can be two continuously +btree splitting happen during journal replay, this is very very rare in +practice. 
As default blocksize is set to sector size, that equals to +6 blocks reserved. Contiously splitting the btree for 6 times in journal +replay is almost impossible, so the reserved space seems to be enough +in my humble opinion. + +If in future the reserved space turns out to be not enough, let's extend +it then. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 100 ++++++++++++++++++++++++++++++++++++++++---- + drivers/md/bcache/journal.h | 4 ++ + 2 files changed, 97 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index a6deb16c15c8..c60a702f53a9 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -415,6 +415,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + ++ s->journal.in_replay = true; ++ + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + +@@ -448,6 +450,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + pr_info("journal replay done, %i keys in %i entries, seq %llu", + keys, entries, end); + err: ++ s->journal.in_replay = false; + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); +@@ -577,6 +580,22 @@ static void do_journal_discard(struct cache *ca) + } + } + ++static inline bool last_available_journal_bucket(struct cache_set *c) ++{ ++ struct cache *ca; ++ unsigned int iter; ++ struct journal_device *ja; ++ ++ for_each_cache(ca, c, iter) { ++ ja = &ca->journal; ++ if (unlikely((ja->cur_idx + 1) % ca->sb.njournal_buckets == ++ ja->last_idx)) ++ return true; ++ } ++ ++ return false; ++} ++ + static void journal_reclaim(struct cache_set *c) + { + struct bkey *k = &c->journal.key; +@@ -584,6 +603,7 @@ static void journal_reclaim(struct cache_set *c) + uint64_t last_seq; + unsigned int iter, n = 0; + atomic_t p 
__maybe_unused; ++ bool last, do_wakeup = false; + + atomic_long_inc(&c->reclaim); + +@@ -606,8 +626,13 @@ static void journal_reclaim(struct cache_set *c) + for_each_cache(ca, c, iter) + do_journal_discard(ca); + +- if (c->journal.blocks_free) ++ last = last_available_journal_bucket(c); ++ if ((!last && c->journal.blocks_free) || ++ (last && (c->journal.blocks_free * c->sb.block_size) > ++ BCH_JOURNAL_RPLY_RESERVE)) { ++ do_wakeup = true; + goto out; ++ } + + /* + * Allocate: +@@ -632,9 +657,10 @@ static void journal_reclaim(struct cache_set *c) + bkey_init(k); + SET_KEY_PTRS(k, n); + c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; ++ do_wakeup = true; + } + out: +- if (!journal_full(&c->journal)) ++ if (do_wakeup && !journal_full(&c->journal)) + __closure_wake_up(&c->journal.wait); + } + +@@ -692,6 +718,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + ++static bool should_reclaim(struct cache_set *c, ++ struct journal_write *w) ++{ ++ if (unlikely(journal_full(&c->journal))) ++ return true; ++ ++ if (unlikely(last_available_journal_bucket(c) && ++ (!c->journal.in_replay) && ++ (c->journal.blocks_free * c->sb.block_size <= ++ BCH_JOURNAL_RPLY_RESERVE))) ++ return true; ++ ++ return false; ++} ++ + static void journal_write_unlocked(struct closure *cl) + __releases(c->journal.lock) + { +@@ -710,7 +751,7 @@ static void journal_write_unlocked(struct closure *cl) + if (!w->need_write) { + closure_return_with_destructor(cl, journal_write_unlock); + return; +- } else if (journal_full(&c->journal)) { ++ } else if (should_reclaim(c, w)) { + journal_reclaim(c); + spin_unlock(&c->journal.lock); + +@@ -798,6 +839,52 @@ static void journal_try_write(struct cache_set *c) + } + } + ++static bool no_journal_wait(struct cache_set *c, ++ size_t sectors) ++{ ++ bool last = last_available_journal_bucket(c); ++ size_t reserved_sectors = 0; ++ size_t n = min_t(size_t, ++ c->journal.blocks_free * c->sb.block_size, ++ 
PAGE_SECTORS << JSET_BITS); ++ ++ if (last && !c->journal.in_replay) ++ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; ++ ++ if (sectors <= (n - reserved_sectors)) ++ return true; ++ ++ return false; ++} ++ ++static bool should_try_write(struct cache_set *c, ++ struct journal_write *w) ++{ ++ size_t reserved_sectors, n, sectors; ++ ++ if (journal_full(&c->journal)) ++ return false; ++ ++ if (!last_available_journal_bucket(c)) ++ return true; ++ ++ /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */ ++ if (w->data->keys == 0) ++ return false; ++ ++ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; ++ n = min_t(size_t, ++ (c->journal.blocks_free * c->sb.block_size), ++ PAGE_SECTORS << JSET_BITS); ++ sectors = __set_blocks(w->data, w->data->keys, ++ block_bytes(c)) * c->sb.block_size; ++ if (sectors <= (n - reserved_sectors)) ++ return true; ++ ++ return false; ++} ++ ++ + static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned int nkeys) + __acquires(&c->journal.lock) +@@ -816,15 +903,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(c)) * c->sb.block_size; + +- if (sectors <= min_t(size_t, +- c->journal.blocks_free * c->sb.block_size, +- PAGE_SECTORS << JSET_BITS)) ++ if (no_journal_wait(c, sectors)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + +- if (!journal_full(&c->journal)) { ++ if (should_try_write(c, w)) { + if (wait) + trace_bcache_journal_entry_full(c); + +@@ -933,6 +1018,7 @@ int bch_journal_alloc(struct cache_set *c) + INIT_DELAYED_WORK(&j->work, journal_write_work); + + c->journal_delay_ms = 100; ++ j->in_replay = false; + + j->w[0].c = c; + j->w[1].c = c; +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index 66f0facff84b..54408e248a39 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -108,6 +108,7 @@ struct journal { + struct closure io; + 
int io_in_flight; + struct delayed_work work; ++ bool in_replay; + + /* Number of blocks free in the bucket(s) we're currently writing to */ + unsigned int blocks_free; +@@ -159,6 +160,9 @@ struct journal_device { + + #define JOURNAL_PIN 20000 + ++/* Reserved jouranl space in sectors */ ++#define BCH_JOURNAL_RPLY_RESERVE 6U ++ + #define journal_full(j) \ + (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) + +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch new file mode 100644 index 0000000..07050e9 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch @@ -0,0 +1,241 @@ +From 4d3d26818916654397a930e8ce082b650dc809eb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 28 Feb 2019 20:29:00 +0800 +Subject: [RFC PATCH v2 05/16] bcache: reserve space for journal_meta() in run + time + +Another journal deadlock of bcache jouranling can happen in normal +bcache runtime. It is very rare to happen but there are people report +bkey insert work queue blocked which caused by such deadlock. + +This is how such jouranling deadlock in runtime happens, +- Journal space is totally full and no free space to reclaim, jouranling + tasks waiting for space to write in journal_wait_for_write(). +- In order to have free journal space, btree_flush_write() is called to + flush earlest journaled in-memory btree key into btree node. Then all + journaled bkey in early used journal buckets are flushed to on-disk + btree, this journal bucket can be reclaimed for new coming jouranl + request. +- But if the earlest jouranled bkey causes a btree node split during + insert it into btree node, finally journal_meta() will be called to + journal btree root (and other information) into the journal space. 
+- Unfortunately the journal space is full, and the journal entries have + to be flushed in linear turn. So bch_journal_meta() from bkey insert + is blocked too. +Then journaling deadlock during bcache run time happens. + +A method to fix such deadlock is to reserve some journal space too. The +reserved space can only be used when, +- Current journal bucket is the last journal bucket which has available + space to write into. +- When calling bch_journal(), current jset is empty and there is no key + in the inserting key list. This means the journal request is from + bch_journal_meta() and no non-reserved space can be used. + +Then if such journaling request is from bch_journal_meta() of inserting +the earliest journaled bkey back into btree, the deadlock condition won't +happen any more because the reserved space can be used for such +scenario. + +Since there are already 6 sectors reserved for journal replay, here we +reserve 7 sectors for runtime meta journal from btree split caused by +flushing journal entries back to btree node. Depending on block size from +1 sector to 4KB, the reserved space can serve from 7 to 2 journal +blocks. Indeed only one journal block reserved for such journal deadlock +scenario is enough, 2 continuous btree splits caused by two adjacent bkey +flushing from journal is very very rare to happen. So reserving 7 sectors +should work. + +Another reason for reserving 7 sectors is, there are already 6 sectors +reserved for journal replay, so in total there are 13 sectors reserved in +last available journal bucket. 13 sectors won't be a proper bucket size, +so we don't need to add more code to handle journal.blocks_free +initialization for whole reserved journal bucket. Even such code logic +is simple, less code is better in my humble opinion. + +Again, if in future the reserved space turns out to be not enough, let's +extend it then. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 89 +++++++++++++++++++++++++++++++++------------ + drivers/md/bcache/journal.h | 1 + + 2 files changed, 66 insertions(+), 24 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index c60a702f53a9..6aa68ab7cd78 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -629,7 +629,7 @@ static void journal_reclaim(struct cache_set *c) + last = last_available_journal_bucket(c); + if ((!last && c->journal.blocks_free) || + (last && (c->journal.blocks_free * c->sb.block_size) > +- BCH_JOURNAL_RPLY_RESERVE)) { ++ (BCH_JOURNAL_RESERVE + BCH_JOURNAL_RPLY_RESERVE))) { + do_wakeup = true; + goto out; + } +@@ -718,18 +718,27 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static bool should_reclaim(struct cache_set *c, +- struct journal_write *w) ++static inline bool should_reclaim(struct cache_set *c, ++ struct journal_write *w) + { +- if (unlikely(journal_full(&c->journal))) +- return true; ++ bool last = last_available_journal_bucket(c); + +- if (unlikely(last_available_journal_bucket(c) && +- (!c->journal.in_replay) && +- (c->journal.blocks_free * c->sb.block_size <= +- BCH_JOURNAL_RPLY_RESERVE))) ++ if (!last && journal_full(&c->journal)) + return true; + ++ if (unlikely(last)) { ++ size_t n = c->journal.blocks_free * c->sb.block_size; ++ ++ if (!c->journal.in_replay) { ++ if (n <= BCH_JOURNAL_RESERVE + ++ BCH_JOURNAL_RPLY_RESERVE) ++ return true; ++ } else { ++ if (n <= BCH_JOURNAL_RPLY_RESERVE) ++ return true; ++ } ++ } ++ + return false; + } + +@@ -751,7 +760,9 @@ static void journal_write_unlocked(struct closure *cl) + if (!w->need_write) { + closure_return_with_destructor(cl, journal_write_unlock); + return; +- } else if (should_reclaim(c, w)) { ++ } ++ ++ if (should_reclaim(c, w)) { + journal_reclaim(c); + spin_unlock(&c->journal.lock); + +@@ -840,16 +851,26 @@ static void 
journal_try_write(struct cache_set *c) + } + + static bool no_journal_wait(struct cache_set *c, +- size_t sectors) ++ size_t sectors, ++ int nkeys) + { ++ bool is_journal_meta = (nkeys == 0) ? true : false; + bool last = last_available_journal_bucket(c); + size_t reserved_sectors = 0; +- size_t n = min_t(size_t, +- c->journal.blocks_free * c->sb.block_size, +- PAGE_SECTORS << JSET_BITS); ++ size_t n; ++ ++ if (unlikely(last)) { ++ if (!is_journal_meta) ++ reserved_sectors = BCH_JOURNAL_RESERVE + ++ BCH_JOURNAL_RPLY_RESERVE; ++ else ++ reserved_sectors = (!c->journal.in_replay) ? ++ BCH_JOURNAL_RPLY_RESERVE : 0; ++ } + +- if (last && !c->journal.in_replay) +- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; ++ n = min_t(size_t, ++ c->journal.blocks_free * c->sb.block_size, ++ PAGE_SECTORS << JSET_BITS); + + if (sectors <= (n - reserved_sectors)) + return true; +@@ -858,26 +879,46 @@ static bool no_journal_wait(struct cache_set *c, + } + + static bool should_try_write(struct cache_set *c, +- struct journal_write *w) ++ struct journal_write *w, ++ int nkeys) + { + size_t reserved_sectors, n, sectors; ++ bool last, empty_jset; + + if (journal_full(&c->journal)) + return false; + +- if (!last_available_journal_bucket(c)) ++ last = last_available_journal_bucket(c); ++ empty_jset = (w->data->keys == 0) ? true : false; ++ ++ if (!last) { ++ /* ++ * Not last available journal bucket, no reserved journal ++ * space restriction, an empty jset should not be here. ++ */ ++ BUG_ON(empty_jset); + return true; ++ } + +- /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */ +- if (w->data->keys == 0) ++ if (empty_jset) { ++ /* ++ * If nkeys is 0 it means the journaling request is for meta ++ * data, which should be returned in journal_wait_for_write() ++ * by checking no_journal_wait(), and won't get here. 
++ */ ++ BUG_ON(nkeys == 0); + return false; ++ } + +- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE; ++ reserved_sectors = BCH_JOURNAL_RESERVE + ++ BCH_JOURNAL_RPLY_RESERVE; + n = min_t(size_t, + (c->journal.blocks_free * c->sb.block_size), + PAGE_SECTORS << JSET_BITS); +- sectors = __set_blocks(w->data, w->data->keys, ++ sectors = __set_blocks(w->data, ++ w->data->keys, + block_bytes(c)) * c->sb.block_size; ++ + if (sectors <= (n - reserved_sectors)) + return true; + +@@ -903,13 +944,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c, + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(c)) * c->sb.block_size; + +- if (no_journal_wait(c, sectors)) ++ if (no_journal_wait(c, sectors, nkeys)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + +- if (should_try_write(c, w)) { ++ if (should_try_write(c, w, nkeys)) { + if (wait) + trace_bcache_journal_entry_full(c); + +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index 54408e248a39..55f81443f304 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -162,6 +162,7 @@ struct journal_device { + + /* Reserved jouranl space in sectors */ + #define BCH_JOURNAL_RPLY_RESERVE 6U ++#define BCH_JOURNAL_RESERVE 7U + + #define journal_full(j) \ + (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch b/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch index 47fee81..47fee81 100644 --- a/for-test/jouranl-deadlock/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch +++ b/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch diff --git a/for-test/jouranl-deadlock/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch b/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch 
index c675a6d..c675a6d 100644 --- a/for-test/jouranl-deadlock/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch +++ b/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch diff --git a/for-test/jouranl-deadlock/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch b/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch index 01f188c..01f188c 100644 --- a/for-test/jouranl-deadlock/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch +++ b/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch diff --git a/for-test/jouranl-deadlock/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch b/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch index 4d342e2..4d342e2 100644 --- a/for-test/jouranl-deadlock/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch +++ b/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch diff --git a/for-test/jouranl-deadlock/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch b/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch index 191177d..191177d 100644 --- a/for-test/jouranl-deadlock/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch +++ b/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch diff --git a/for-test/jouranl-deadlock/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch b/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch index 3b0c2e3..3b0c2e3 100644 --- a/for-test/jouranl-deadlock/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch +++ b/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch diff --git 
a/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch b/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch new file mode 100644 index 0000000..d81c648 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch @@ -0,0 +1,107 @@ +From 6da8faaaf5e2ecd2fb3d11ae6bd8ab8ee19b39bc Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 20 Mar 2019 23:11:59 +0800 +Subject: [RFC PATCH v2 12/16] bcache: add pendings_cleanup to stop pending + bcache device + +If a bcache device is in dirty state and its cache set is not +registered, this bcache device will not appear in /dev/bcache<N>, +and there is no way to stop it or remove the bcache kernel module. + +This is an as-designed behavior, but sometimes people have to reboot +the whole system to release or stop the pending backing device. + +This sysfs interface may remove such pending bcache devices when +anything is written into the sysfs file manually. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 55 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 9b41e0b62cc0..e988e46a6479 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2246,9 +2246,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, + + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); ++kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); + + static bool bch_is_open_backing(struct block_device *bdev) + { +@@ -2373,6 +2377,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + goto out; + } + ++ ++struct pdev { ++ struct list_head list; ++ struct cached_dev *dc; ++}; ++ ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, ++ size_t size) ++{ ++ LIST_HEAD(pending_devs); ++ ssize_t ret = size; ++ struct cached_dev *dc, *tdc; ++ struct pdev *pdev, *tpdev; ++ struct cache_set *c, *tc; ++ ++ mutex_lock(&bch_register_lock); ++ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { ++ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); ++ if (!pdev) ++ break; ++ pdev->dc = dc; ++ list_add(&pdev->list, &pending_devs); ++ } ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { ++ char *pdev_set_uuid = pdev->dc->sb.set_uuid; ++ char *set_uuid = c->sb.uuid; ++ ++ if (!memcmp(pdev_set_uuid, set_uuid, 16)) { ++ list_del(&pdev->list); ++ kfree(pdev); ++ break; ++ } ++ } ++ } ++ 
mutex_unlock(&bch_register_lock); ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ pr_info("delete pdev %p", pdev); ++ list_del(&pdev->list); ++ bcache_device_stop(&pdev->dc->disk); ++ kfree(pdev); ++ } ++ ++ return ret; ++} ++ + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + { + if (code == SYS_DOWN || +@@ -2483,6 +2537,7 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++ &ksysfs_pendings_cleanup.attr, + NULL + }; + +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch b/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch new file mode 100644 index 0000000..d76c955 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch @@ -0,0 +1,90 @@ +From e6ac565cfb5676a9e833e62570fb8a9d786eda47 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 23 Mar 2019 22:54:35 +0800 +Subject: [RFC PATCH v2 13/16] bcache: fix fifo index swapping condition in + btree_flush_write() + +Current journal_max_cmp() and journal_min_cmp() assume that smaller fifo +index indicating elder journal entries, but this is only true when fifo +index is not swapped. + +Fifo structure journal.pin is implemented by a cycle buffer, if the head +index reaches highest location of the cycle buffer, it will be swapped +to 0. Once the swapping happens, it means a smaller fifo index might be +associated to a newer journal entry. So the btree node with oldest +journal entry won't be selected by btree_flush_write() to flush out to +cache device. The result is, the oldest journal entries may always has +no chance to be written into cache device, and after a reboot +bch_journal_replay() may complain some journal entries are missing. 
+ +This patch handles the fifo index swapping conditions properly, then in +btree_flush_write() the btree node with oldest journal entry can be +slected from c->flush_btree correctly. + +Cc: stable@vger.kernel.org +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 47 +++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 41 insertions(+), 6 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index bdb6f9cefe48..bc0e01151155 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -464,12 +464,47 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + } + + /* Journalling */ +-#define journal_max_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +-#define journal_min_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) ++#define journal_max_cmp(l, r) \ ++({ \ ++ int l_idx, r_idx, f_idx, b_idx; \ ++ bool _ret = true; \ ++ \ ++ l_idx = fifo_idx(&c->journal.pin, btree_current_write(l)->journal); \ ++ r_idx = fifo_idx(&c->journal.pin, btree_current_write(r)->journal); \ ++ f_idx = c->journal.pin.front; \ ++ b_idx = c->journal.pin.back; \ ++ \ ++ _ret = (l_idx < r_idx); \ ++ /* in case fifo back pointer is swapped */ \ ++ if (b_idx < f_idx) { \ ++ if (l_idx <= b_idx && r_idx >= f_idx) \ ++ _ret = false; \ ++ else if (l_idx >= f_idx && r_idx <= b_idx) \ ++ _ret = true; \ ++ } \ ++ _ret; \ ++}) ++ ++#define journal_min_cmp(l, r) \ ++({ \ ++ int l_idx, r_idx, f_idx, b_idx; \ ++ bool _ret = true; \ ++ \ ++ l_idx = fifo_idx(&c->journal.pin, btree_current_write(l)->journal); \ ++ r_idx = fifo_idx(&c->journal.pin, btree_current_write(r)->journal); \ ++ f_idx = c->journal.pin.front; \ ++ b_idx = c->journal.pin.back; \ ++ \ ++ _ret = (l_idx > r_idx); \ ++ /* in case fifo back pointer is 
swapped */ \ ++ if (b_idx < f_idx) { \ ++ if (l_idx <= b_idx && r_idx >= f_idx) \ ++ _ret = true; \ ++ else if (l_idx >= f_idx && r_idx <= b_idx) \ ++ _ret = false; \ ++ } \ ++ _ret; \ ++}) + + static void btree_flush_write(struct cache_set *c) + { +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch b/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch new file mode 100644 index 0000000..4955ef8 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch @@ -0,0 +1,82 @@ +From d5786e57fca69b65b4b334e34d9ec8033ed6721f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 24 Mar 2019 00:06:05 +0800 +Subject: [RFC PATCH v2 14/16] bcache: try to flush btree nodes as many as + possible + +When btree_flush_write() is called, it means the journal space is +exhuasted already. Current code only selects a single btree node to +write out, which may introduce huge cache bounce from the spinlock on +multiple cpu cores, when a lot of kworkers on journaling code path to +call btree_flush_write() for journal space reclaiming. + +This patch tries to flush as many btree node as possible inside +a single call to btree_flush_write(), then the frequence of calling +btree_flush_write() can be reduced, which in turn reduces the cache +bounce from spinlock on multiple cpu cores. Please notice that this +patch does not reduce the total times of acquiring spinlock, a spin +lock is still acquired when select every single btree node to write +out, but this patch will try best to hold the spinlock on same cpu +core, which avoids the cache bounce where the spinlock is acquired by +multiple different cpu cores. + +After the patch applied, in my pressure testing, 'top' shows more than +50% sys cpu time reduced from the kworks which competing spinlock +inside btree_flush_write(). 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 7 ++++++- + drivers/md/bcache/journal.h | 4 ++-- + 2 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index bc0e01151155..8536e76fcac9 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -514,6 +514,7 @@ static void btree_flush_write(struct cache_set *c) + */ + struct btree *b; + int i; ++ int n = FLUSH_BTREE_HEAP; + + atomic_long_inc(&c->flush_write); + +@@ -552,6 +553,10 @@ static void btree_flush_write(struct cache_set *c) + + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); ++ ++ /* try to flush btree nodes as many as possible */ ++ if (--n > 0) ++ goto retry; + } + } + +@@ -1102,7 +1107,7 @@ int bch_journal_alloc(struct cache_set *c) + j->w[0].c = c; + j->w[1].c = c; + +- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || ++ if (!(init_heap(&c->flush_btree, FLUSH_BTREE_HEAP, GFP_KERNEL)) || + !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index 55f81443f304..a8be14c6f6d9 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -158,8 +158,8 @@ struct journal_device { + #define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + +-#define JOURNAL_PIN 20000 +- ++#define FLUSH_BTREE_HEAP 128 ++#define JOURNAL_PIN 20000 + /* Reserved jouranl space in sectors */ + #define BCH_JOURNAL_RPLY_RESERVE 6U + #define BCH_JOURNAL_RESERVE 7U +-- +2.16.4 + diff --git a/for-test/jouranl-deadlock/v2-0015-bcache-improve-bcache_reboot.patch b/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch index 3c92f1d..3c92f1d 100644 --- 
a/for-test/jouranl-deadlock/v2-0015-bcache-improve-bcache_reboot.patch +++ b/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch diff --git a/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch b/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch new file mode 100644 index 0000000..a3d6691 --- /dev/null +++ b/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch @@ -0,0 +1,74 @@ +From 24539bb78565d784ddabb81f24968c13835eb000 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 24 Mar 2019 23:55:27 +0800 +Subject: [RFC PATCH v2 16/16] bcache: introduce spinlock_t flush_write_lock in + struct journal + +In btree_flush_write(), iterating all cached btree nodes and adding them +into ordered heap c->flush_btree takes quite a long time. In order to +protect ordered heap c->flush_btree, spin lock c->journal.lock is held +for all the iteration and heap ordering. When journal space is fully +occupied, btree_flush_write() might be called frequently, if the cached +btree node iteration takes too much time, kernel will complain that +normal journal kworkers are blocked too long. Of course write performance +drops at this moment. + +This patch introduces a new spin lock member in struct journal, named +flush_write_lock. This lock is only used in btree_flush_write() and +protects the ordered heap c->flush_btree during all the cached btree node +iteration. Then there won't be lock contention on c->journal.lock. + +After this fix, when journal space is fully occupied, it is very rare to +observe the journal kworker blocking timeout warning. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 5 +++-- + drivers/md/bcache/journal.h | 1 + + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 8536e76fcac9..6e38470f6924 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -519,7 +519,7 @@ static void btree_flush_write(struct cache_set *c) + atomic_long_inc(&c->flush_write); + + retry: +- spin_lock(&c->journal.lock); ++ spin_lock(&c->journal.flush_write_lock); + if (heap_empty(&c->flush_btree)) { + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { +@@ -540,7 +540,7 @@ static void btree_flush_write(struct cache_set *c) + + b = NULL; + heap_pop(&c->flush_btree, b, journal_min_cmp); +- spin_unlock(&c->journal.lock); ++ spin_unlock(&c->journal.flush_write_lock); + + if (b) { + mutex_lock(&b->write_lock); +@@ -1099,6 +1099,7 @@ int bch_journal_alloc(struct cache_set *c) + struct journal *j = &c->journal; + + spin_lock_init(&j->lock); ++ spin_lock_init(&j->flush_write_lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); + + c->journal_delay_ms = 100; +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index a8be14c6f6d9..d8ad99f6191b 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -103,6 +103,7 @@ struct journal_write { + /* Embedded in struct cache_set */ + struct journal { + spinlock_t lock; ++ spinlock_t flush_write_lock; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +-- +2.16.4 + |