author Coly Li <colyli@suse.de> 2019-05-14 23:14:27 +0800
committer Coly Li <colyli@suse.de> 2019-05-14 23:14:27 +0800
commit 03d4e899a667696d44b08d3564e1fa57d181fab2 (patch)
tree c0d3ae717b5517bc31db4936df5b605266e665db
parent d78c68017a888eebb745cbf405d1bcb3d655372a (diff)
download bcache-patches-03d4e899a667696d44b08d3564e1fa57d181fab2.tar.gz
for-test:
- move v2 journal-deadlock fixes into journal-deadlock/v2/
- add more testing patches
-rw-r--r-- for-current/0000-cover-letter.patch (renamed from for-next/0000-cover-letter.patch) 0
-rw-r--r-- for-current/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch (renamed from for-next/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch) 0
-rw-r--r-- for-current/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch (renamed from for-next/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch) 0
-rw-r--r-- for-current/0003-bcache-avoid-clang-Wunintialized-warning.patch (renamed from for-next/0003-bcache-avoid-clang-Wunintialized-warning.patch) 0
-rw-r--r-- for-current/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch (renamed from for-next/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch) 0
-rw-r--r-- for-current/0005-bcache-Clean-up-bch_get_congested.patch (renamed from for-next/0005-bcache-Clean-up-bch_get_congested.patch) 0
-rw-r--r-- for-current/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch (renamed from for-next/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch) 0
-rw-r--r-- for-current/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch (renamed from for-next/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch) 0
-rw-r--r-- for-current/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch (renamed from for-next/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch) 0
-rw-r--r-- for-current/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch (renamed from for-next/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch) 0
-rw-r--r-- for-current/0010-bcache-add-comments-for-kobj-release-callback-routin.patch (renamed from for-next/0010-bcache-add-comments-for-kobj-release-callback-routin.patch) 0
-rw-r--r-- for-current/0011-bcache-return-error-immediately-in-bch_journal_repla.patch (renamed from for-next/0011-bcache-return-error-immediately-in-bch_journal_repla.patch) 0
-rw-r--r-- for-current/0012-bcache-add-error-check-for-calling-register_bdev.patch (renamed from for-next/0012-bcache-add-error-check-for-calling-register_bdev.patch) 0
-rw-r--r-- for-current/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch (renamed from for-next/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch) 0
-rw-r--r-- for-current/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch (renamed from for-next/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch) 0
-rw-r--r-- for-current/0015-bcache-improve-bcache_reboot.patch (renamed from for-next/0015-bcache-improve-bcache_reboot.patch) 0
-rw-r--r-- for-current/0016-bcache-fix-failure-in-journal-relplay.patch (renamed from for-next/0016-bcache-fix-failure-in-journal-relplay.patch) 0
-rw-r--r-- for-current/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch (renamed from for-next/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch) 0
-rw-r--r-- for-current/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch (renamed from for-next/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch) 0
-rw-r--r-- for-test/0001-bcache-check-return-value-of-prio_read.patch 76
-rw-r--r-- for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch 55
-rw-r--r-- for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch 37
-rw-r--r-- for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch 31
-rw-r--r-- for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch 56
-rw-r--r-- for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch 58
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch (renamed from for-test/jouranl-deadlock/v2-0000-cover-letter.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch (renamed from for-test/jouranl-deadlock/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch (renamed from for-test/jouranl-deadlock/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch 161
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch 276
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch 241
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch (renamed from for-test/jouranl-deadlock/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch (renamed from for-test/jouranl-deadlock/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch (renamed from for-test/jouranl-deadlock/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch (renamed from for-test/jouranl-deadlock/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch (renamed from for-test/jouranl-deadlock/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch (renamed from for-test/jouranl-deadlock/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch 107
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch 90
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch 82
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch (renamed from for-test/jouranl-deadlock/v2-0015-bcache-improve-bcache_reboot.patch) 0
-rw-r--r-- for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch 74
42 files changed, 1344 insertions, 0 deletions
diff --git a/for-next/0000-cover-letter.patch b/for-current/0000-cover-letter.patch
index 4c50e79..4c50e79 100644
--- a/for-next/0000-cover-letter.patch
+++ b/for-current/0000-cover-letter.patch
diff --git a/for-next/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch b/for-current/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch
index e4b4e30..e4b4e30 100644
--- a/for-next/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch
+++ b/for-current/0001-bcache-fix-crashes-stopping-bcache-device-before-rea.patch
diff --git a/for-next/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch b/for-current/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch
index 9ade407..9ade407 100644
--- a/for-next/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch
+++ b/for-current/0002-bcache-fix-inaccurate-result-of-unused-buckets.patch
diff --git a/for-next/0003-bcache-avoid-clang-Wunintialized-warning.patch b/for-current/0003-bcache-avoid-clang-Wunintialized-warning.patch
index 0fc146b..0fc146b 100644
--- a/for-next/0003-bcache-avoid-clang-Wunintialized-warning.patch
+++ b/for-current/0003-bcache-avoid-clang-Wunintialized-warning.patch
diff --git a/for-next/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch b/for-current/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch
index 2525d6d..2525d6d 100644
--- a/for-next/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch
+++ b/for-current/0004-bcache-use-kmemdup_nul-for-CACHED_LABEL-buffer.patch
diff --git a/for-next/0005-bcache-Clean-up-bch_get_congested.patch b/for-current/0005-bcache-Clean-up-bch_get_congested.patch
index 37915b0..37915b0 100644
--- a/for-next/0005-bcache-Clean-up-bch_get_congested.patch
+++ b/for-current/0005-bcache-Clean-up-bch_get_congested.patch
diff --git a/for-next/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch b/for-current/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch
index 1e0a60a..1e0a60a 100644
--- a/for-next/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch
+++ b/for-current/0006-bcache-fix-a-race-between-cache-register-and-cachese.patch
diff --git a/for-next/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch b/for-current/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch
index 8ec6b9f..8ec6b9f 100644
--- a/for-next/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch
+++ b/for-current/0007-bcache-move-definition-of-int-ret-out-of-macro-read_.patch
diff --git a/for-next/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch b/for-current/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch
index c6e2834..c6e2834 100644
--- a/for-next/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch
+++ b/for-current/0008-bcache-never-set-KEY_PTRS-of-jouranl-key-to-0-in-jou.patch
diff --git a/for-next/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch b/for-current/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch
index aa0bd63..aa0bd63 100644
--- a/for-next/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch
+++ b/for-current/0009-bcache-add-failure-check-to-run_cache_set-for-journa.patch
diff --git a/for-next/0010-bcache-add-comments-for-kobj-release-callback-routin.patch b/for-current/0010-bcache-add-comments-for-kobj-release-callback-routin.patch
index 3376600..3376600 100644
--- a/for-next/0010-bcache-add-comments-for-kobj-release-callback-routin.patch
+++ b/for-current/0010-bcache-add-comments-for-kobj-release-callback-routin.patch
diff --git a/for-next/0011-bcache-return-error-immediately-in-bch_journal_repla.patch b/for-current/0011-bcache-return-error-immediately-in-bch_journal_repla.patch
index a0a26d7..a0a26d7 100644
--- a/for-next/0011-bcache-return-error-immediately-in-bch_journal_repla.patch
+++ b/for-current/0011-bcache-return-error-immediately-in-bch_journal_repla.patch
diff --git a/for-next/0012-bcache-add-error-check-for-calling-register_bdev.patch b/for-current/0012-bcache-add-error-check-for-calling-register_bdev.patch
index 2ecf470..2ecf470 100644
--- a/for-next/0012-bcache-add-error-check-for-calling-register_bdev.patch
+++ b/for-current/0012-bcache-add-error-check-for-calling-register_bdev.patch
diff --git a/for-next/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch b/for-current/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch
index d668b43..d668b43 100644
--- a/for-next/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch
+++ b/for-current/0013-bcache-Add-comments-for-blkdev_put-in-registration-c.patch
diff --git a/for-next/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch b/for-current/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch
index 547e03a..547e03a 100644
--- a/for-next/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch
+++ b/for-current/0014-bcache-add-comments-for-closure_fn-to-be-called-in-c.patch
diff --git a/for-next/0015-bcache-improve-bcache_reboot.patch b/for-current/0015-bcache-improve-bcache_reboot.patch
index 8b570d1..8b570d1 100644
--- a/for-next/0015-bcache-improve-bcache_reboot.patch
+++ b/for-current/0015-bcache-improve-bcache_reboot.patch
diff --git a/for-next/0016-bcache-fix-failure-in-journal-relplay.patch b/for-current/0016-bcache-fix-failure-in-journal-relplay.patch
index ad82e9e..ad82e9e 100644
--- a/for-next/0016-bcache-fix-failure-in-journal-relplay.patch
+++ b/for-current/0016-bcache-fix-failure-in-journal-relplay.patch
diff --git a/for-next/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch b/for-current/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch
index 2d7b79c..2d7b79c 100644
--- a/for-next/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch
+++ b/for-current/0017-bcache-fix-wrong-usage-use-after-freed-on-keylist-in.patch
diff --git a/for-next/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch b/for-current/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch
index 9f113ec..9f113ec 100644
--- a/for-next/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch
+++ b/for-current/0018-bcache-avoid-potential-memleak-of-list-of-journal_re.patch
diff --git a/for-test/0001-bcache-check-return-value-of-prio_read.patch b/for-test/0001-bcache-check-return-value-of-prio_read.patch
new file mode 100644
index 0000000..7d4c016
--- /dev/null
+++ b/for-test/0001-bcache-check-return-value-of-prio_read.patch
@@ -0,0 +1,76 @@
+From d6bda91f5c00320df4fc1dbdf5b95d5d1c22d606 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 14 Feb 2019 15:44:57 +0800
+Subject: [PATCH] bcache: check return value of prio_read()
+
+Then we can print out the error message in run_cache_set() properly.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 21 ++++++++++++++++-----
+ 1 file changed, 16 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 4dee119c3664..1147ed26febf 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -591,12 +591,13 @@ void bch_prio_write(struct cache *ca)
+ }
+ }
+
+-static void prio_read(struct cache *ca, uint64_t bucket)
++static int prio_read(struct cache *ca, uint64_t bucket)
+ {
+ struct prio_set *p = ca->disk_buckets;
+ struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
+ struct bucket *b;
+ unsigned int bucket_nr = 0;
++ int ret = -EIO;
+
+ for (b = ca->buckets;
+ b < ca->buckets + ca->sb.nbuckets;
+@@ -609,11 +610,15 @@ static void prio_read(struct cache *ca, uint64_t bucket)
+ prio_io(ca, bucket, REQ_OP_READ, 0);
+
+ if (p->csum !=
+- bch_crc64(&p->magic, bucket_bytes(ca) - 8))
++ bch_crc64(&p->magic, bucket_bytes(ca) - 8)) {
+ pr_warn("bad csum reading priorities");
++ goto out;
++ }
+
+- if (p->magic != pset_magic(&ca->sb))
++ if (p->magic != pset_magic(&ca->sb)) {
+ pr_warn("bad magic reading priorities");
++ goto out;
++ }
+
+ bucket = p->next_bucket;
+ d = p->data;
+@@ -622,6 +627,10 @@ static void prio_read(struct cache *ca, uint64_t bucket)
+ b->prio = le16_to_cpu(d->prio);
+ b->gen = b->last_gc = d->gen;
+ }
++
++ ret = 0;
++out:
++ return ret;
+ }
+
+ /* Bcache device */
+@@ -1807,8 +1816,10 @@ static void run_cache_set(struct cache_set *c)
+ j = &list_entry(journal.prev, struct journal_replay, list)->j;
+
+ err = "IO error reading priorities";
+- for_each_cache(ca, c, i)
+- prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
++ for_each_cache(ca, c, i) {
++ if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
++ goto err;
++ }
+
+ /*
+ * If prio_read() fails it'll call cache_set_error and we'll
+--
+2.16.4
+
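The change above follows the common kernel single-exit error pattern: initialize ret to a failure code, jump to a shared label on any failed check, and set ret to 0 only after every check passes. A minimal standalone sketch of that shape (validate_one() and NR are hypothetical, for illustration only):

    static int validate_all(void)
    {
            int i, ret = -EIO;      /* assume failure until all checks pass */

            for (i = 0; i < NR; i++) {
                    if (!validate_one(i))
                            goto out;       /* ret is still -EIO here */
            }
            ret = 0;                        /* every check passed */
    out:
            return ret;
    }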
diff --git a/for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch b/for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch
new file mode 100644
index 0000000..1338418
--- /dev/null
+++ b/for-test/0001-bcache-ignore-read-ahead-request-failure-on-backing-.patch
@@ -0,0 +1,55 @@
+From 31dc685d78b6f77ddd3d4ffa97478431a6602ed9 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 13 May 2019 22:48:09 +0800
+Subject: [PATCH 1/5] bcache: ignore read-ahead request failure on backing
+ device
+
+When an md raid device (e.g. raid456) is used as the backing device,
+read-ahead requests on a degraded and recovering md raid device might
+fail immediately in the md raid code, although the md raid array can
+still serve normal read or write I/O requests. Therefore such failed
+read-ahead requests are not real hardware failures. Furthermore, after
+the degraded array finishes recovering, read-ahead requests will be
+handled by the md raid array again.
+
+In such a condition, I/O failures of read-ahead requests don't indicate
+the real health status (normal I/O is still served), so they should not
+be counted in the I/O error counter dc->io_errors.
+
+Since there is no simple way to detect whether the backing device is an
+md raid device, this patch simply ignores I/O failures of read-ahead
+bios on the backing device, to avoid bogus backing device failures on a
+degraded md raid array.
+
+Suggested-by: Thorsten Knabe <linux@thorsten-knabe.de>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/io.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
+index c25097968319..4d93f07f63e5 100644
+--- a/drivers/md/bcache/io.c
++++ b/drivers/md/bcache/io.c
+@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
+
+ WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
+
++ /*
++ * Read-ahead requests on a degrading and recovering md raid
++ * (e.g. raid6) device might be failured immediately by md
++ * raid code, which is not a real hardware media failure. So
++ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
++ */
++ if (bio->bi_opf & REQ_RAHEAD) {
++ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore",
++ dc->backing_dev_name);
++ return;
++ }
++
+ errors = atomic_add_return(1, &dc->io_errors);
+ if (errors < dc->error_limit)
+ pr_err("%s: IO error on backing device, unrecoverable",
+--
+2.16.4
+
diff --git a/for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch b/for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch
new file mode 100644
index 0000000..21e2ad5
--- /dev/null
+++ b/for-test/0002-bcache-add-io-error-counting-in-write_bdev_super_end.patch
@@ -0,0 +1,37 @@
+From 1ccada2ebb2f37fbe2b0a3705a3166e4f3f8d2fb Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 13 May 2019 23:42:39 +0800
+Subject: [PATCH 2/5] bcache: add io error counting in write_bdev_super_endio()
+
+When the backing device super block is written by bch_write_bdev_super(),
+the bio completion callback write_bdev_super_endio() simply ignores the
+I/O status. Indeed such a write request also contributes to the backing
+device health status if the request failed.
+
+This patch checks bio->bi_status in write_bdev_super_endio(); if there
+is an error, bch_count_backing_io_errors() will be called to count an
+I/O error in dc->io_errors.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 1b63ac876169..2858682cce14 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+ static void write_bdev_super_endio(struct bio *bio)
+ {
+ struct cached_dev *dc = bio->bi_private;
+- /* XXX: error checking */
++
++ if (bio->bi_status)
++ bch_count_backing_io_errors(dc, bio);
+
+ closure_put(&dc->sb_write);
+ }
+--
+2.16.4
+
diff --git a/for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch b/for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch
new file mode 100644
index 0000000..9243605
--- /dev/null
+++ b/for-test/0003-bcache-remove-XXX-comment-line-from-run_cache_set.patch
@@ -0,0 +1,31 @@
+From 5e92305f8838785b2c42ed2cb8c5f2bc03103e94 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 13 May 2019 23:47:38 +0800
+Subject: [PATCH 3/5] bcache: remove "XXX:" comment line from run_cache_set()
+
+In previous bcache patches for Linux v5.2, the failure code path of
+run_cache_set() was tested and fixed. So now the following comment
+line can be removed from run_cache_set(),
+ /* XXX: test this, it's broken */
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 2858682cce14..9d9f852852c6 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1959,7 +1959,7 @@ static int run_cache_set(struct cache_set *c)
+ }
+
+ closure_sync(&cl);
+- /* XXX: test this, it's broken */
++
+ bch_cache_set_error(c, "%s", err);
+
+ return -EIO;
+--
+2.16.4
+
diff --git a/for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch b/for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch
new file mode 100644
index 0000000..d8f996a
--- /dev/null
+++ b/for-test/0004-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch
@@ -0,0 +1,56 @@
+From 77980a54c7e90525e8cada5b75bc44daa214d9e5 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 14 May 2019 22:23:35 +0800
+Subject: [PATCH 4/5] bcache: remove unnecessary prefetch() in
+ bset_search_tree()
+
+In function bset_search_tree(), when p >= t->size, t->tree[0] will be
+prefetched by the following code piece,
+ 974 unsigned int p = n << 4;
+ 975
+ 976 p &= ((int) (p - t->size)) >> 31;
+ 977
+ 978 prefetch(&t->tree[p]);
+
+The purpose of the above code is to avoid a branch instruction, but
+when p >= t->size, prefetch(&t->tree[0]) has no positive performance
+contribution at all. This patch avoids the unnecessary prefetch by only
+calling prefetch() when p < t->size.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bset.c | 16 ++--------------
+ 1 file changed, 2 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
+index 8f07fa6e1739..aa2e4ab0fab9 100644
+--- a/drivers/md/bcache/bset.c
++++ b/drivers/md/bcache/bset.c
+@@ -960,22 +960,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
+ unsigned int inorder, j, n = 1;
+
+ do {
+- /*
+- * A bit trick here.
+- * If p < t->size, (int)(p - t->size) is a minus value and
+- * the most significant bit is set, right shifting 31 bits
+- * gets 1. If p >= t->size, the most significant bit is
+- * not set, right shifting 31 bits gets 0.
+- * So the following 2 lines equals to
+- * if (p >= t->size)
+- * p = 0;
+- * but a branch instruction is avoided.
+- */
+ unsigned int p = n << 4;
+
+- p &= ((int) (p - t->size)) >> 31;
+-
+- prefetch(&t->tree[p]);
++ if (p < t->size)
++ prefetch(&t->tree[p]);
+
+ j = n;
+ f = &t->tree[j];
+--
+2.16.4
+
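For reference, the trick being removed is a branchless clamp: for unsigned p and t->size below 2^31, (int)(p - t->size) is negative exactly when p < t->size, so an arithmetic right shift by 31 yields an all-ones mask in that case and 0 otherwise. A small user-space check of the equivalence (a sketch; it assumes the compiler implements >> on negative ints as an arithmetic shift, as gcc and clang do):

    #include <assert.h>

    static unsigned int clamp_branchless(unsigned int p, unsigned int size)
    {
            p &= ((int)(p - size)) >> 31;   /* p when p < size, else 0 */
            return p;
    }

    int main(void)
    {
            unsigned int p, size = 128;

            for (p = 0; p < 1024; p++)
                    assert(clamp_branchless(p, size) == (p < size ? p : 0));
            return 0;
    }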
diff --git a/for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch b/for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch
new file mode 100644
index 0000000..1ed7fae
--- /dev/null
+++ b/for-test/0005-bcache-make-bset_search_tree-be-more-understandable.patch
@@ -0,0 +1,58 @@
+From 5e31e419f54eb8db7f4e95bf9328523e801c1dfb Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 14 May 2019 22:51:40 +0800
+Subject: [PATCH 5/5] bcache: make bset_search_tree() be more understandable
+
+The purpose of the following code in bset_search_tree() is to avoid a branch
+instruction,
+ 994 if (likely(f->exponent != 127))
+ 995 n = j * 2 + (((unsigned int)
+ 996 (f->mantissa -
+ 997 bfloat_mantissa(search, f))) >> 31);
+ 998 else
+ 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+1000 ? j * 2
+1001 : j * 2 + 1;
+
+This piece of code is not easy to understand; even when I tried to add
+a code comment for it, I made a mistake. This patch removes the implicit
+bit operation and uses an explicit branch to calculate the next location
+in the binary tree search.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bset.c | 17 +++--------------
+ 1 file changed, 3 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
+index aa2e4ab0fab9..f752cc791f50 100644
+--- a/drivers/md/bcache/bset.c
++++ b/drivers/md/bcache/bset.c
+@@ -968,21 +968,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
+ j = n;
+ f = &t->tree[j];
+
+- /*
+- * Similar bit trick, use subtract operation to avoid a branch
+- * instruction.
+- *
+- * n = (f->mantissa > bfloat_mantissa())
+- * ? j * 2
+- * : j * 2 + 1;
+- *
+- * We need to subtract 1 from f->mantissa for the sign bit trick
+- * to work - that's done in make_bfloat()
+- */
+ if (likely(f->exponent != 127))
+- n = j * 2 + (((unsigned int)
+- (f->mantissa -
+- bfloat_mantissa(search, f))) >> 31);
++ n = (f->mantissa >= bfloat_mantissa(search, f))
++ ? j * 2
++ : j * 2 + 1;
+ else
+ n = (bkey_cmp(tree_to_bkey(t, j), search) > 0)
+ ? j * 2
+--
+2.16.4
+
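The replaced expression relies on the same sign-bit idea: when the mantissa values fit in 31 bits, ((unsigned int)(a - b)) >> 31 is 0 if a >= b and 1 if a < b, so j * 2 plus that bit selects the left or right child without a branch (per the removed comment, make_bfloat() subtracts 1 from f->mantissa to make this work out). A tiny demonstration of the bit itself (a sketch; values assumed below 2^31):

    #include <assert.h>

    /* Branchless (a < b): 0 when a >= b, 1 when a < b. */
    static unsigned int less_bit(unsigned int a, unsigned int b)
    {
            return ((unsigned int)(a - b)) >> 31;
    }

    int main(void)
    {
            unsigned int j = 5;

            assert(j * 2 + less_bit(7, 3) == j * 2);        /* a >= b: left  */
            assert(j * 2 + less_bit(3, 7) == j * 2 + 1);    /* a <  b: right */
            return 0;
    }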
diff --git a/for-test/jouranl-deadlock/v2-0000-cover-letter.patch b/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch
index 19d3c21..19d3c21 100644
--- a/for-test/jouranl-deadlock/v2-0000-cover-letter.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0000-cover-letter.patch
diff --git a/for-test/jouranl-deadlock/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch b/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch
index 6f5e2da..6f5e2da 100644
--- a/for-test/jouranl-deadlock/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0001-bcache-move-definition-of-int-ret-out-of-macro-re.patch
diff --git a/for-test/jouranl-deadlock/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch b/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch
index fcb490d..fcb490d 100644
--- a/for-test/jouranl-deadlock/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0002-bcache-never-set-0-to-KEY_PTRS-of-jouranl-key-in-.patch
diff --git a/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch
new file mode 100644
index 0000000..cfe5323
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0003-bcache-reload-jouranl-key-information-during-jour.patch
@@ -0,0 +1,161 @@
+From e3c194808a99446e9bf69ac0707c7d3f473be518 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 27 Feb 2019 20:32:22 +0800
+Subject: [RFC PATCH v2 03/16] bcache: reload jouranl key information during
+ journal replay
+
+When the bcache journal initializes on a running cache set, the cache
+set journal.blocks_free is initialized to 0. Then during journal replay,
+if journal_meta() is called and an empty jset is written to the cache
+device, journal_reclaim() is called. If there is an available journal
+bucket to reclaim, c->journal.blocks_free is set to the number of blocks
+of a journal bucket, which is c->sb.bucket_size >> c->block_bits.
+
+Most of the time the above process works correctly, except for the
+condition when the journal space is almost full. "Almost full" means
+there is no free journal bucket, but there are still free blocks in the
+last available bucket, indexed by ja->cur_idx.
+
+The problem comes if the system crashes or reboots when the journal
+space is almost full. During cache set reload after the reboot,
+c->journal.blocks_free is initialized to 0; when the journal replay
+process writes the bcache journal, journal_reclaim() will be called to
+reclaim an available journal bucket and set c->journal.blocks_free to
+c->sb.bucket_size >> c->block_bits. But there is no fully free bucket to
+reclaim in journal_reclaim(), so the value of c->journal.blocks_free
+stays 0. If the first journal entry processed by journal_replay() causes
+a btree split and requires journal space to be written by journal_meta(),
+journal_meta() has to go into an infinite loop trying to reclaim a
+journal bucket, and blocks the whole cache set from running.
+
+Such a buggy situation can be solved if we do the following things
+before journal replay starts,
+- Recover the previous value of c->journal.blocks_free from the last run
+ time, and set it as the initial value of c->journal.blocks_free.
+- Recover the previous value of ja->cur_idx from the last run time, and
+ set it as the initial KEY_PTR value of c->journal.key.
+
+After c->journal.blocks_free and c->journal.key are recovered, in the
+condition when the journal space is almost full and the cache set is
+reloaded, a meta journal entry from journal replay can be written into
+the free blocks of the last available journal bucket; then old journal
+entries can be replayed and reclaimed for further journaling requests.
+
+This patch adds bch_journal_key_reload() to recover the journal
+blocks_free and key ptr values for the above purpose. It is called in
+bch_journal_read() before the journal is replayed by bch_journal_replay().
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/journal.c | 87 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 87 insertions(+)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 5180bed911ef..a6deb16c15c8 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -143,6 +143,89 @@ reread: left = ca->sb.bucket_size - offset;
+ return ret;
+ }
+
++static int bch_journal_key_reload(struct cache_set *c)
++{
++ struct cache *ca;
++ unsigned int iter, n = 0;
++ struct bkey *k = &c->journal.key;
++ int ret = 0;
++
++ for_each_cache(ca, c, iter) {
++ struct journal_device *ja = &ca->journal;
++ struct bio *bio = &ja->bio;
++ struct jset *j, *data = c->journal.w[0].data;
++ struct closure cl;
++ unsigned int len, left;
++ unsigned int offset = 0, used_blocks = 0;
++ sector_t bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]);
++
++ closure_init_stack(&cl);
++
++ while (offset < ca->sb.bucket_size) {
++reread: left = ca->sb.bucket_size - offset;
++ len = min_t(unsigned int,
++ left, PAGE_SECTORS << JSET_BITS);
++
++ bio_reset(bio);
++ bio->bi_iter.bi_sector = bucket + offset;
++ bio_set_dev(bio, ca->bdev);
++ bio->bi_iter.bi_size = len << 9;
++
++ bio->bi_end_io = journal_read_endio;
++ bio->bi_private = &cl;
++ bio_set_op_attrs(bio, REQ_OP_READ, 0);
++ bch_bio_map(bio, data);
++
++ closure_bio_submit(c, bio, &cl);
++ closure_sync(&cl);
++
++ j = data;
++ while (len) {
++ size_t blocks, bytes = set_bytes(j);
++
++ if (j->magic != jset_magic(&ca->sb))
++ goto out;
++
++ if (bytes > left << 9 ||
++ bytes > PAGE_SIZE << JSET_BITS) {
++ pr_err("jset may be corrupted: too big");
++ ret = -EIO;
++ goto err;
++ }
++
++ if (bytes > len << 9)
++ goto reread;
++
++ if (j->csum != csum_set(j)) {
++ pr_err("jset may be corrupted: bad csum");
++ ret = -EIO;
++ goto err;
++ }
++
++ blocks = set_blocks(j, block_bytes(c));
++ used_blocks += blocks;
++
++ offset += blocks * ca->sb.block_size;
++ len -= blocks * ca->sb.block_size;
++ j = ((void *) j) + blocks * block_bytes(ca);
++ }
++ }
++out:
++ c->journal.blocks_free =
++ (c->sb.bucket_size >> c->block_bits) -
++ used_blocks;
++
++ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev);
++ }
++
++ BUG_ON(n == 0);
++ bkey_init(k);
++ SET_KEY_PTRS(k, n);
++
++err:
++ return ret;
++}
++
+ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ {
+ #define read_bucket(b) \
+@@ -268,6 +351,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
+ struct journal_replay,
+ list)->j.seq;
+
++ /* Initial value of c->journal.blocks_free should be 0 */
++ BUG_ON(c->journal.blocks_free != 0);
++ ret = bch_journal_key_reload(c);
++
+ return ret;
+ #undef read_bucket
+ }
+--
+2.16.4
+
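The recovery arithmetic itself is small: a journal bucket holds c->sb.bucket_size >> c->block_bits blocks, and whatever the rescan above counted as used comes off the top. A worked example with assumed geometry (illustrative numbers, not values from the patch):

    /* Assumed: 1024-sector buckets, 4KB blocks (8 sectors, block_bits = 3) */
    size_t bucket_blocks = 1024 >> 3;                   /* 128 blocks/bucket */
    size_t used_blocks = 100;                           /* found by rescan   */
    size_t blocks_free = bucket_blocks - used_blocks;   /* 28 blocks usable  */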
diff --git a/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch
new file mode 100644
index 0000000..39b9873
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0004-bcache-fix-journal-deadlock-during-jouranl-replay.patch
@@ -0,0 +1,276 @@
+From 97898c33b4126381cb08f8560623325cc23291e5 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 27 Feb 2019 20:35:02 +0800
+Subject: [RFC PATCH v2 04/16] bcache: fix journal deadlock during jouranl
+ replay
+
+A deadlock of bcache journaling may happen during journal replay. Such
+a deadlock happens when,
+- Journal space is totally full (no free blocks at all) and the system
+ crashes or reboots.
+- After reboot, the first journal entry handled by journal replay causes
+ a btree split and journal_meta() is called to write an empty jset to
+ the journal space.
+- There is no journal space to write to, and journal_reclaim() fails to
+ get any available bucket because this is the first replayed journal
+ entry to be blocked.
+Then the whole cache set is blocked from running.
+
+This patch is an effort to fix such a journal replay deadlock in a
+simpler way,
+- Add a bool variable 'in_replay' to struct journal; set it to true when
+ journal replay starts, and set it to false when journal replay
+ completes. in_replay is initialized to false.
+- Reserve 6 sectors in the journal bucket and do not use them in normal
+ bcache runtime. These sectors are only permitted to be used during
+ journal replay (when c->journal.in_replay is true).
+
+Then in normal bcache runtime, the journal space won't be totally full
+and 6 sectors are always reserved for journal replay time. After the
+system reboots, if bch_btree_insert() in bch_journal_replay() causes a
+btree split and bch_journal_meta() gets called to request 1 sector
+from the journal buckets to write an empty jset, there is enough
+reserved space to serve it.
+
+The reason to reserve 6 sectors is, we should choose a number that won't
+fit exactly into a bucket size. If the reserved space happened to be a
+whole bucket, more logic would have to be added in journal_replay() to
+handle journal.blocks_free with reserved space at journal replay time.
+This is why 6 sectors is chosen: it is 3KB and won't match any proper
+block size or bucket size.
+
+The bcache btree node size is quite large, so a btree node split won't
+be a frequent event. And when a btree node split happens, the newly
+added key will be inserted directly into the upper level or neighbor
+nodes and won't go into the journal again; only bch_journal_meta() is
+called to write jset metadata, which occupies 1 block in the journal
+space. If the block size is set to 4K, the reserved 6 sectors are 2
+blocks, so two consecutive btree splits can happen during journal
+replay, which is very rare in practice. As the default block size is
+the sector size, that equals 6 reserved blocks. Continuously splitting
+the btree 6 times during journal replay is almost impossible, so the
+reserved space seems to be enough in my humble opinion.
+
+If in the future the reserved space turns out to be not enough, let's
+extend it then.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/journal.c | 100 ++++++++++++++++++++++++++++++++++++++++----
+ drivers/md/bcache/journal.h | 4 ++
+ 2 files changed, 97 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index a6deb16c15c8..c60a702f53a9 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -415,6 +415,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
+ uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+ struct keylist keylist;
+
++ s->journal.in_replay = true;
++
+ list_for_each_entry(i, list, list) {
+ BUG_ON(i->pin && atomic_read(i->pin) != 1);
+
+@@ -448,6 +450,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
+ pr_info("journal replay done, %i keys in %i entries, seq %llu",
+ keys, entries, end);
+ err:
++ s->journal.in_replay = false;
+ while (!list_empty(list)) {
+ i = list_first_entry(list, struct journal_replay, list);
+ list_del(&i->list);
+@@ -577,6 +580,22 @@ static void do_journal_discard(struct cache *ca)
+ }
+ }
+
++static inline bool last_available_journal_bucket(struct cache_set *c)
++{
++ struct cache *ca;
++ unsigned int iter;
++ struct journal_device *ja;
++
++ for_each_cache(ca, c, iter) {
++ ja = &ca->journal;
++ if (unlikely((ja->cur_idx + 1) % ca->sb.njournal_buckets ==
++ ja->last_idx))
++ return true;
++ }
++
++ return false;
++}
++
+ static void journal_reclaim(struct cache_set *c)
+ {
+ struct bkey *k = &c->journal.key;
+@@ -584,6 +603,7 @@ static void journal_reclaim(struct cache_set *c)
+ uint64_t last_seq;
+ unsigned int iter, n = 0;
+ atomic_t p __maybe_unused;
++ bool last, do_wakeup = false;
+
+ atomic_long_inc(&c->reclaim);
+
+@@ -606,8 +626,13 @@ static void journal_reclaim(struct cache_set *c)
+ for_each_cache(ca, c, iter)
+ do_journal_discard(ca);
+
+- if (c->journal.blocks_free)
++ last = last_available_journal_bucket(c);
++ if ((!last && c->journal.blocks_free) ||
++ (last && (c->journal.blocks_free * c->sb.block_size) >
++ BCH_JOURNAL_RPLY_RESERVE)) {
++ do_wakeup = true;
+ goto out;
++ }
+
+ /*
+ * Allocate:
+@@ -632,9 +657,10 @@ static void journal_reclaim(struct cache_set *c)
+ bkey_init(k);
+ SET_KEY_PTRS(k, n);
+ c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
++ do_wakeup = true;
+ }
+ out:
+- if (!journal_full(&c->journal))
++ if (do_wakeup && !journal_full(&c->journal))
+ __closure_wake_up(&c->journal.wait);
+ }
+
+@@ -692,6 +718,21 @@ static void journal_write_unlock(struct closure *cl)
+ spin_unlock(&c->journal.lock);
+ }
+
++static bool should_reclaim(struct cache_set *c,
++ struct journal_write *w)
++{
++ if (unlikely(journal_full(&c->journal)))
++ return true;
++
++ if (unlikely(last_available_journal_bucket(c) &&
++ (!c->journal.in_replay) &&
++ (c->journal.blocks_free * c->sb.block_size <=
++ BCH_JOURNAL_RPLY_RESERVE)))
++ return true;
++
++ return false;
++}
++
+ static void journal_write_unlocked(struct closure *cl)
+ __releases(c->journal.lock)
+ {
+@@ -710,7 +751,7 @@ static void journal_write_unlocked(struct closure *cl)
+ if (!w->need_write) {
+ closure_return_with_destructor(cl, journal_write_unlock);
+ return;
+- } else if (journal_full(&c->journal)) {
++ } else if (should_reclaim(c, w)) {
+ journal_reclaim(c);
+ spin_unlock(&c->journal.lock);
+
+@@ -798,6 +839,52 @@ static void journal_try_write(struct cache_set *c)
+ }
+ }
+
++static bool no_journal_wait(struct cache_set *c,
++ size_t sectors)
++{
++ bool last = last_available_journal_bucket(c);
++ size_t reserved_sectors = 0;
++ size_t n = min_t(size_t,
++ c->journal.blocks_free * c->sb.block_size,
++ PAGE_SECTORS << JSET_BITS);
++
++ if (last && !c->journal.in_replay)
++ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE;
++
++ if (sectors <= (n - reserved_sectors))
++ return true;
++
++ return false;
++}
++
++static bool should_try_write(struct cache_set *c,
++ struct journal_write *w)
++{
++ size_t reserved_sectors, n, sectors;
++
++ if (journal_full(&c->journal))
++ return false;
++
++ if (!last_available_journal_bucket(c))
++ return true;
++
++ /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */
++ if (w->data->keys == 0)
++ return false;
++
++ reserved_sectors = BCH_JOURNAL_RPLY_RESERVE;
++ n = min_t(size_t,
++ (c->journal.blocks_free * c->sb.block_size),
++ PAGE_SECTORS << JSET_BITS);
++ sectors = __set_blocks(w->data, w->data->keys,
++ block_bytes(c)) * c->sb.block_size;
++ if (sectors <= (n - reserved_sectors))
++ return true;
++
++ return false;
++}
++
++
+ static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ unsigned int nkeys)
+ __acquires(&c->journal.lock)
+@@ -816,15 +903,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ sectors = __set_blocks(w->data, w->data->keys + nkeys,
+ block_bytes(c)) * c->sb.block_size;
+
+- if (sectors <= min_t(size_t,
+- c->journal.blocks_free * c->sb.block_size,
+- PAGE_SECTORS << JSET_BITS))
++ if (no_journal_wait(c, sectors))
+ return w;
+
+ if (wait)
+ closure_wait(&c->journal.wait, &cl);
+
+- if (!journal_full(&c->journal)) {
++ if (should_try_write(c, w)) {
+ if (wait)
+ trace_bcache_journal_entry_full(c);
+
+@@ -933,6 +1018,7 @@ int bch_journal_alloc(struct cache_set *c)
+ INIT_DELAYED_WORK(&j->work, journal_write_work);
+
+ c->journal_delay_ms = 100;
++ j->in_replay = false;
+
+ j->w[0].c = c;
+ j->w[1].c = c;
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index 66f0facff84b..54408e248a39 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -108,6 +108,7 @@ struct journal {
+ struct closure io;
+ int io_in_flight;
+ struct delayed_work work;
++ bool in_replay;
+
+ /* Number of blocks free in the bucket(s) we're currently writing to */
+ unsigned int blocks_free;
+@@ -159,6 +160,9 @@ struct journal_device {
+
+ #define JOURNAL_PIN 20000
+
++/* Reserved jouranl space in sectors */
++#define BCH_JOURNAL_RPLY_RESERVE 6U
++
+ #define journal_full(j) \
+ (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
+
+--
+2.16.4
+
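The reserve checks added above reduce to one comparison: convert blocks_free to sectors, subtract the replay reserve when writing to the last available bucket outside of replay, and see whether the jset still fits. A condensed sketch of that decision (names mirror the patch; locking and the PAGE_SECTORS << JSET_BITS cap are omitted):

    #define BCH_JOURNAL_RPLY_RESERVE 6U     /* sectors, as in the patch */

    static bool fits_without_reserve(size_t blocks_free, size_t block_size,
                                     bool last_bucket, bool in_replay,
                                     size_t sectors)
    {
            size_t avail = blocks_free * block_size;    /* free sectors */
            size_t reserved = (last_bucket && !in_replay) ?
                              BCH_JOURNAL_RPLY_RESERVE : 0;

            if (avail <= reserved)
                    return false;           /* only the reserve is left */
            return sectors <= avail - reserved;
    }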
diff --git a/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch
new file mode 100644
index 0000000..07050e9
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0005-bcache-reserve-space-for-journal_meta-in-run-time.patch
@@ -0,0 +1,241 @@
+From 4d3d26818916654397a930e8ce082b650dc809eb Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 28 Feb 2019 20:29:00 +0800
+Subject: [RFC PATCH v2 05/16] bcache: reserve space for journal_meta() in run
+ time
+
+Another journal deadlock of bcache journaling can happen in normal
+bcache runtime. It is very rare, but there are reports of the bkey
+insert work queue being blocked, caused by such a deadlock.
+
+This is how such a journaling deadlock happens at runtime,
+- Journal space is totally full with no free space to reclaim, and
+ journaling tasks wait for space to write in journal_wait_for_write().
+- In order to free journal space, btree_flush_write() is called to
+ flush the earliest journaled in-memory btree key into a btree node.
+ Once all journaled bkeys in the earliest used journal bucket are
+ flushed to the on-disk btree, this journal bucket can be reclaimed
+ for newly coming journal requests.
+- But if the earliest journaled bkey causes a btree node split while
+ being inserted into the btree node, finally journal_meta() will be
+ called to journal the btree root (and more) into the journal space.
+- Unfortunately the journal space is full, and the journal entries have
+ to be flushed in linear order. So bch_journal_meta() from the bkey
+ insert is blocked too.
+Then a journaling deadlock during bcache runtime happens.
+
+A method to fix such a deadlock is to reserve some journal space too.
+The reserved space can only be used when,
+- The current journal bucket is the last journal bucket which has
+ available space to write into.
+- When calling bch_journal(), the current jset is empty and there is no
+ key in the inserting key list. This means the journal request is from
+ bch_journal_meta() and no non-reserved space can be used.
+
+Then if such a journaling request is from bch_journal_meta() while
+inserting the earliest journaled bkey back into the btree, the deadlock
+condition won't happen anymore, because the reserved space can be used
+for such a scenario.
+
+Since there are already 6 sectors reserved for journal replay, here we
+reserve 7 sectors for runtime meta journaling from btree splits caused
+by flushing journal entries back to btree nodes. Depending on a block
+size from 1 sector to 4KB, the reserved space can serve for from 7 to 2
+journal blocks. Indeed only one reserved journal block is enough for
+such a journal deadlock scenario; 2 consecutive btree splits caused by
+two adjoining bkeys flushed from the journal are very rare to happen.
+So reserving 7 sectors should work.
+
+Another reason for reserving 7 sectors is, there are already 6 sectors
+reserved for journal replay, so in total 13 sectors are reserved in the
+last available journal bucket. 13 sectors won't be a proper bucket size,
+so we don't need to add more code to handle the journal.blocks_free
+initialization for a whole reserved journal bucket. Even though such
+code logic is simple, less code is better in my humble opinion.
+
+Again, if in the future the reserved space turns out to be not enough,
+let's extend it then.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/journal.c | 89 +++++++++++++++++++++++++++++++++------------
+ drivers/md/bcache/journal.h | 1 +
+ 2 files changed, 66 insertions(+), 24 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index c60a702f53a9..6aa68ab7cd78 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -629,7 +629,7 @@ static void journal_reclaim(struct cache_set *c)
+ last = last_available_journal_bucket(c);
+ if ((!last && c->journal.blocks_free) ||
+ (last && (c->journal.blocks_free * c->sb.block_size) >
+- BCH_JOURNAL_RPLY_RESERVE)) {
++ (BCH_JOURNAL_RESERVE + BCH_JOURNAL_RPLY_RESERVE))) {
+ do_wakeup = true;
+ goto out;
+ }
+@@ -718,18 +718,27 @@ static void journal_write_unlock(struct closure *cl)
+ spin_unlock(&c->journal.lock);
+ }
+
+-static bool should_reclaim(struct cache_set *c,
+- struct journal_write *w)
++static inline bool should_reclaim(struct cache_set *c,
++ struct journal_write *w)
+ {
+- if (unlikely(journal_full(&c->journal)))
+- return true;
++ bool last = last_available_journal_bucket(c);
+
+- if (unlikely(last_available_journal_bucket(c) &&
+- (!c->journal.in_replay) &&
+- (c->journal.blocks_free * c->sb.block_size <=
+- BCH_JOURNAL_RPLY_RESERVE)))
++ if (!last && journal_full(&c->journal))
+ return true;
+
++ if (unlikely(last)) {
++ size_t n = c->journal.blocks_free * c->sb.block_size;
++
++ if (!c->journal.in_replay) {
++ if (n <= BCH_JOURNAL_RESERVE +
++ BCH_JOURNAL_RPLY_RESERVE)
++ return true;
++ } else {
++ if (n <= BCH_JOURNAL_RPLY_RESERVE)
++ return true;
++ }
++ }
++
+ return false;
+ }
+
+@@ -751,7 +760,9 @@ static void journal_write_unlocked(struct closure *cl)
+ if (!w->need_write) {
+ closure_return_with_destructor(cl, journal_write_unlock);
+ return;
+- } else if (should_reclaim(c, w)) {
++ }
++
++ if (should_reclaim(c, w)) {
+ journal_reclaim(c);
+ spin_unlock(&c->journal.lock);
+
+@@ -840,16 +851,26 @@ static void journal_try_write(struct cache_set *c)
+ }
+
+ static bool no_journal_wait(struct cache_set *c,
+- size_t sectors)
++ size_t sectors,
++ int nkeys)
+ {
++ bool is_journal_meta = (nkeys == 0) ? true : false;
+ bool last = last_available_journal_bucket(c);
+ size_t reserved_sectors = 0;
+- size_t n = min_t(size_t,
+- c->journal.blocks_free * c->sb.block_size,
+- PAGE_SECTORS << JSET_BITS);
++ size_t n;
++
++ if (unlikely(last)) {
++ if (!is_journal_meta)
++ reserved_sectors = BCH_JOURNAL_RESERVE +
++ BCH_JOURNAL_RPLY_RESERVE;
++ else
++ reserved_sectors = (!c->journal.in_replay) ?
++ BCH_JOURNAL_RPLY_RESERVE : 0;
++ }
+
+- if (last && !c->journal.in_replay)
+- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE;
++ n = min_t(size_t,
++ c->journal.blocks_free * c->sb.block_size,
++ PAGE_SECTORS << JSET_BITS);
+
+ if (sectors <= (n - reserved_sectors))
+ return true;
+@@ -858,26 +879,46 @@ static bool no_journal_wait(struct cache_set *c,
+ }
+
+ static bool should_try_write(struct cache_set *c,
+- struct journal_write *w)
++ struct journal_write *w,
++ int nkeys)
+ {
+ size_t reserved_sectors, n, sectors;
++ bool last, empty_jset;
+
+ if (journal_full(&c->journal))
+ return false;
+
+- if (!last_available_journal_bucket(c))
++ last = last_available_journal_bucket(c);
++ empty_jset = (w->data->keys == 0) ? true : false;
++
++ if (!last) {
++ /*
++ * Not last available journal bucket, no reserved journal
++ * space restriction, an empty jset should not be here.
++ */
++ BUG_ON(empty_jset);
+ return true;
++ }
+
+- /* the check in no_journal_wait exceeds BCH_JOURNAL_RPLY_RESERVE */
+- if (w->data->keys == 0)
++ if (empty_jset) {
++ /*
++ * If nkeys is 0 it means the journaling request is for meta
++ * data, which should be returned in journal_wait_for_write()
++ * by checking no_journal_wait(), and won't get here.
++ */
++ BUG_ON(nkeys == 0);
+ return false;
++ }
+
+- reserved_sectors = BCH_JOURNAL_RPLY_RESERVE;
++ reserved_sectors = BCH_JOURNAL_RESERVE +
++ BCH_JOURNAL_RPLY_RESERVE;
+ n = min_t(size_t,
+ (c->journal.blocks_free * c->sb.block_size),
+ PAGE_SECTORS << JSET_BITS);
+- sectors = __set_blocks(w->data, w->data->keys,
++ sectors = __set_blocks(w->data,
++ w->data->keys,
+ block_bytes(c)) * c->sb.block_size;
++
+ if (sectors <= (n - reserved_sectors))
+ return true;
+
+@@ -903,13 +944,13 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
+ sectors = __set_blocks(w->data, w->data->keys + nkeys,
+ block_bytes(c)) * c->sb.block_size;
+
+- if (no_journal_wait(c, sectors))
++ if (no_journal_wait(c, sectors, nkeys))
+ return w;
+
+ if (wait)
+ closure_wait(&c->journal.wait, &cl);
+
+- if (should_try_write(c, w)) {
++ if (should_try_write(c, w, nkeys)) {
+ if (wait)
+ trace_bcache_journal_entry_full(c);
+
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index 54408e248a39..55f81443f304 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -162,6 +162,7 @@ struct journal_device {
+
+ /* Reserved jouranl space in sectors */
+ #define BCH_JOURNAL_RPLY_RESERVE 6U
++#define BCH_JOURNAL_RESERVE 7U
+
+ #define journal_full(j) \
+ (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1)
+--
+2.16.4
+
diff --git a/for-test/jouranl-deadlock/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch b/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch
index 47fee81..47fee81 100644
--- a/for-test/jouranl-deadlock/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0006-bcache-add-failure-check-to-run_cache_set-for-jou.patch
diff --git a/for-test/jouranl-deadlock/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch b/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch
index c675a6d..c675a6d 100644
--- a/for-test/jouranl-deadlock/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0007-bcache-add-comments-for-kobj-release-callback-rou.patch
diff --git a/for-test/jouranl-deadlock/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch b/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch
index 01f188c..01f188c 100644
--- a/for-test/jouranl-deadlock/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0008-bcache-return-error-immediately-in-bch_journal_re.patch
diff --git a/for-test/jouranl-deadlock/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch b/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch
index 4d342e2..4d342e2 100644
--- a/for-test/jouranl-deadlock/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0009-bcache-add-error-check-for-calling-register_bdev.patch
diff --git a/for-test/jouranl-deadlock/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch b/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch
index 191177d..191177d 100644
--- a/for-test/jouranl-deadlock/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0010-bcache-Add-comments-for-blkdev_put-in-registratio.patch
diff --git a/for-test/jouranl-deadlock/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch b/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch
index 3b0c2e3..3b0c2e3 100644
--- a/for-test/jouranl-deadlock/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0011-bcache-add-comments-for-closure_fn-to-be-called-i.patch
diff --git a/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch b/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch
new file mode 100644
index 0000000..d81c648
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0012-bcache-add-pendings_cleanup-to-stop-pending-bcach.patch
@@ -0,0 +1,107 @@
+From 6da8faaaf5e2ecd2fb3d11ae6bd8ab8ee19b39bc Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 20 Mar 2019 23:11:59 +0800
+Subject: [RFC PATCH v2 12/16] bcache: add pendings_cleanup to stop pending
+ bcache device
+
+If a bcache device is in dirty state and its cache set is not
+registered, this bcache device will not appear in /dev/bcache<N>,
+and there is no way to stop it or remove the bcache kernel module.
+
+This is an as-designed behavior, but sometimes people have to reboot
+the whole system to release or stop the pending backing device.
+
+This sysfs interface removes such pending bcache devices when anything
+is written into the sysfs file manually.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 55 insertions(+)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 9b41e0b62cc0..e988e46a6479 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -2246,9 +2246,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
+
+ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ const char *buffer, size_t size);
++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
++ struct kobj_attribute *attr,
++ const char *buffer, size_t size);
+
+ kobj_attribute_write(register, register_bcache);
+ kobj_attribute_write(register_quiet, register_bcache);
++kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
+
+ static bool bch_is_open_backing(struct block_device *bdev)
+ {
+@@ -2373,6 +2377,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
+ goto out;
+ }
+
++
++struct pdev {
++ struct list_head list;
++ struct cached_dev *dc;
++};
++
++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
++ struct kobj_attribute *attr,
++ const char *buffer,
++ size_t size)
++{
++ LIST_HEAD(pending_devs);
++ ssize_t ret = size;
++ struct cached_dev *dc, *tdc;
++ struct pdev *pdev, *tpdev;
++ struct cache_set *c, *tc;
++
++ mutex_lock(&bch_register_lock);
++ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
++ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
++ if (!pdev)
++ break;
++ pdev->dc = dc;
++ list_add(&pdev->list, &pending_devs);
++ }
++
++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
++ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
++ char *pdev_set_uuid = pdev->dc->sb.set_uuid;
++ char *set_uuid = c->sb.uuid;
++
++ if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
++ list_del(&pdev->list);
++ kfree(pdev);
++ break;
++ }
++ }
++ }
++ mutex_unlock(&bch_register_lock);
++
++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
++ pr_info("delete pdev %p", pdev);
++ list_del(&pdev->list);
++ bcache_device_stop(&pdev->dc->disk);
++ kfree(pdev);
++ }
++
++ return ret;
++}
++
+ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
+ {
+ if (code == SYS_DOWN ||
+@@ -2483,6 +2537,7 @@ static int __init bcache_init(void)
+ static const struct attribute *files[] = {
+ &ksysfs_register.attr,
+ &ksysfs_register_quiet.attr,
++ &ksysfs_pendings_cleanup.attr,
+ NULL
+ };
+
+--
+2.16.4
+
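With the attribute registered in bcache_init() as above, it should appear beside the existing register files, so (assuming the usual /sys/fs/bcache location of the bcache kobject) stopping all pending backing devices comes down to:

    echo 1 > /sys/fs/bcache/pendings_cleanup

The written value is ignored; any write triggers the cleanup.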
diff --git a/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch b/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch
new file mode 100644
index 0000000..d76c955
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0013-bcache-fix-fifo-index-swapping-condition-in-btree.patch
@@ -0,0 +1,90 @@
+From e6ac565cfb5676a9e833e62570fb8a9d786eda47 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 23 Mar 2019 22:54:35 +0800
+Subject: [RFC PATCH v2 13/16] bcache: fix fifo index swapping condition in
+ btree_flush_write()
+
+Current journal_max_cmp() and journal_min_cmp() assume that smaller fifo
+index indicating elder journal entries, but this is only true when fifo
+index is not swapped.
+
+The fifo structure journal.pin is implemented as a circular buffer: when
+the head index reaches the highest location of the circular buffer, it
+is swapped (wrapped) back to 0. Once this swapping happens, a smaller
+fifo index may be associated with a newer journal entry, so the btree
+node holding the oldest journal entry will not be selected by
+btree_flush_write() to be flushed out to the cache device. As a result,
+the oldest journal entries may never be written to the cache device, and
+after a reboot bch_journal_replay() may complain that entries are missing.
+
+This patch handles the fifo index swapping conditions properly, so that
+in btree_flush_write() the btree node with the oldest journal entry can
+be selected from c->flush_btree correctly.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/journal.c | 47 +++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 41 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index bdb6f9cefe48..bc0e01151155 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -464,12 +464,47 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
+ }
+
+ /* Journalling */
+-#define journal_max_cmp(l, r) \
+- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+-#define journal_min_cmp(l, r) \
+- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
++#define journal_max_cmp(l, r) \
++({ \
++ int l_idx, r_idx, f_idx, b_idx; \
++ bool _ret = true; \
++ \
++ l_idx = fifo_idx(&c->journal.pin, btree_current_write(l)->journal); \
++ r_idx = fifo_idx(&c->journal.pin, btree_current_write(r)->journal); \
++ f_idx = c->journal.pin.front; \
++ b_idx = c->journal.pin.back; \
++ \
++ _ret = (l_idx < r_idx); \
++ /* in case fifo back pointer is swapped */ \
++ if (b_idx < f_idx) { \
++ if (l_idx <= b_idx && r_idx >= f_idx) \
++ _ret = false; \
++ else if (l_idx >= f_idx && r_idx <= b_idx) \
++ _ret = true; \
++ } \
++ _ret; \
++})
++
++#define journal_min_cmp(l, r) \
++({ \
++ int l_idx, r_idx, f_idx, b_idx; \
++ bool _ret = true; \
++ \
++ l_idx = fifo_idx(&c->journal.pin, btree_current_write(l)->journal); \
++ r_idx = fifo_idx(&c->journal.pin, btree_current_write(r)->journal); \
++ f_idx = c->journal.pin.front; \
++ b_idx = c->journal.pin.back; \
++ \
++ _ret = (l_idx > r_idx); \
++ /* in case fifo back pointer is swapped */ \
++ if (b_idx < f_idx) { \
++ if (l_idx <= b_idx && r_idx >= f_idx) \
++ _ret = true; \
++ else if (l_idx >= f_idx && r_idx <= b_idx) \
++ _ret = false; \
++ } \
++ _ret; \
++})
+
+ static void btree_flush_write(struct cache_set *c)
+ {
+--
+2.16.4
+
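[Editorial sketch] The two macros above share one wrap-aware ordering rule, which can be shown in isolation. In the sketch below, front/back play the role of c->journal.pin.front and c->journal.pin.back, and idx_older() mirrors the structure of journal_max_cmp(); the function and variable names are illustrative, not the kernel's.

    /* idx_older.c - standalone userspace sketch of the wrap-aware
     * comparison: build and run with a plain C compiler. */
    #include <assert.h>
    #include <stdbool.h>

    /* Return true if index l refers to an older entry than index r,
     * given the circular buffer's front and back indices. */
    static bool idx_older(int l, int r, int front, int back)
    {
            bool ret = l < r;          /* natural order, no wrap */

            if (back < front) {        /* back pointer has swapped to 0 */
                    if (l <= back && r >= front)
                            ret = false;  /* l wrapped, r did not: l is newer */
                    else if (l >= front && r <= back)
                            ret = true;   /* r wrapped, l did not: l is older */
            }
            return ret;
    }

    int main(void)
    {
            /* No wrap: front=2, back=7; the smaller index is older. */
            assert(idx_older(3, 5, 2, 7));
            /* Wrapped: front=6, back=1; index 7 predates index 0. */
            assert(idx_older(7, 0, 6, 1));
            assert(!idx_older(0, 7, 6, 1));
            return 0;
    }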
diff --git a/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch b/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch
new file mode 100644
index 0000000..4955ef8
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0014-bcache-try-to-flush-btree-nodes-as-many-as-possib.patch
@@ -0,0 +1,82 @@
+From d5786e57fca69b65b4b334e34d9ec8033ed6721f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sun, 24 Mar 2019 00:06:05 +0800
+Subject: [RFC PATCH v2 14/16] bcache: try to flush btree nodes as many as
+ possible
+
+When btree_flush_write() is called, the journal space is already
+exhausted. The current code selects only a single btree node to write
+out, which may introduce heavy cache bouncing on the spinlock across
+multiple cpu cores when many kworkers on the journaling code path call
+btree_flush_write() to reclaim journal space.
+
+This patch tries to flush as many btree nodes as possible inside a
+single call to btree_flush_write(), so the frequency of calling
+btree_flush_write() can be reduced, which in turn reduces the cache
+bouncing on the spinlock across multiple cpu cores. Please note that
+this patch does not reduce the total number of spinlock acquisitions;
+the spinlock is still acquired when selecting each btree node to write
+out, but this patch tries its best to keep the spinlock on the same
+cpu core, which avoids the cache bouncing caused by the spinlock being
+acquired from multiple different cpu cores.
+
+With the patch applied, in my pressure testing, 'top' shows more than
+50% of sys cpu time is reduced from the kworkers competing for the
+spinlock inside btree_flush_write().
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/journal.c | 7 ++++++-
+ drivers/md/bcache/journal.h | 4 ++--
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index bc0e01151155..8536e76fcac9 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -514,6 +514,7 @@ static void btree_flush_write(struct cache_set *c)
+ */
+ struct btree *b;
+ int i;
++ int n = FLUSH_BTREE_HEAP;
+
+ atomic_long_inc(&c->flush_write);
+
+@@ -552,6 +553,10 @@ static void btree_flush_write(struct cache_set *c)
+
+ __bch_btree_node_write(b, NULL);
+ mutex_unlock(&b->write_lock);
++
++ /* try to flush btree nodes as many as possible */
++ if (--n > 0)
++ goto retry;
+ }
+ }
+
+@@ -1102,7 +1107,7 @@ int bch_journal_alloc(struct cache_set *c)
+ j->w[0].c = c;
+ j->w[1].c = c;
+
+- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
++ if (!(init_heap(&c->flush_btree, FLUSH_BTREE_HEAP, GFP_KERNEL)) ||
+ !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+ !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
+ !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index 55f81443f304..a8be14c6f6d9 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -158,8 +158,8 @@ struct journal_device {
+ #define journal_pin_cmp(c, l, r) \
+ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
+
+-#define JOURNAL_PIN 20000
+-
++#define FLUSH_BTREE_HEAP 128
++#define JOURNAL_PIN 20000
+ /* Reserved jouranl space in sectors */
+ #define BCH_JOURNAL_RPLY_RESERVE 6U
+ #define BCH_JOURNAL_RESERVE 7U
+--
+2.16.4
+
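[Editorial sketch] The batching idea can be shown on its own: the lock is still taken once per item, but up to FLUSH_BTREE_HEAP acquisitions now happen back to back on one cpu, instead of one acquisition per wakeup scattered across many cpus. A userspace sketch with a pthread spinlock standing in for the kernel spinlock; the queue is reduced to a counter for brevity.

    /* flush_batch.c - hedged sketch of the v2-0014 batching pattern.
     * Compile with -lpthread. */
    #include <pthread.h>
    #include <stdio.h>

    #define FLUSH_BATCH 128           /* mirrors FLUSH_BTREE_HEAP */

    static pthread_spinlock_t lock;
    static int pending = 1000;        /* stand-in for dirty btree nodes */

    /* Pop and process up to FLUSH_BATCH items per call; each pick still
     * takes the lock, but consecutively on the same cpu, so the cache
     * line holding the lock is not bounced between cores. */
    static void flush_batch(void)
    {
            int n = FLUSH_BATCH;

            while (n-- > 0) {
                    int item;

                    pthread_spin_lock(&lock);
                    item = pending > 0 ? pending-- : -1;
                    pthread_spin_unlock(&lock);

                    if (item < 0)
                            break;    /* nothing left to flush */
                    /* ... write the item out, outside the lock ... */
            }
    }

    int main(void)
    {
            pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
            while (pending > 0)
                    flush_batch();
            printf("all flushed\n");
            pthread_spin_destroy(&lock);
            return 0;
    }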
diff --git a/for-test/jouranl-deadlock/v2-0015-bcache-improve-bcache_reboot.patch b/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch
index 3c92f1d..3c92f1d 100644
--- a/for-test/jouranl-deadlock/v2-0015-bcache-improve-bcache_reboot.patch
+++ b/for-test/jouranl-deadlock/v2/v2-0015-bcache-improve-bcache_reboot.patch
diff --git a/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch b/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch
new file mode 100644
index 0000000..a3d6691
--- /dev/null
+++ b/for-test/jouranl-deadlock/v2/v2-0016-bcache-introduce-spinlock_t-flush_write_lock-in-s.patch
@@ -0,0 +1,74 @@
+From 24539bb78565d784ddabb81f24968c13835eb000 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sun, 24 Mar 2019 23:55:27 +0800
+Subject: [RFC PATCH v2 16/16] bcache: introduce spinlock_t flush_write_lock in
+ struct journal
+
+In btree_flush_write(), iterating over all cached btree nodes and adding
+them into the ordered heap c->flush_btree takes quite a long time. To
+protect the ordered heap c->flush_btree, the spin lock c->journal.lock
+is held for the whole iteration and heap ordering. When journal space is
+fully occupied, btree_flush_write() might be called frequently; if the
+cached btree node iteration takes too much time, the kernel will
+complain that normal journal kworkers are blocked for too long. Of
+course write performance drops at that moment.
+
+This patch introduces a new spin lock member in struct journal, named
+flush_write_lock. This lock is only used in btree_flush_write(), and it
+protects the ordered heap c->flush_btree during the whole cached btree
+node iteration. Then there is no lock contention on c->journal.lock.
+
+After this fix, when journal space is fully occupied, it is very rare to
+observe the journal kworker blocking timeout warning.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/journal.c | 5 +++--
+ drivers/md/bcache/journal.h | 1 +
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 8536e76fcac9..6e38470f6924 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -519,7 +519,7 @@ static void btree_flush_write(struct cache_set *c)
+ atomic_long_inc(&c->flush_write);
+
+ retry:
+- spin_lock(&c->journal.lock);
++ spin_lock(&c->journal.flush_write_lock);
+ if (heap_empty(&c->flush_btree)) {
+ for_each_cached_btree(b, c, i)
+ if (btree_current_write(b)->journal) {
+@@ -540,7 +540,7 @@ static void btree_flush_write(struct cache_set *c)
+
+ b = NULL;
+ heap_pop(&c->flush_btree, b, journal_min_cmp);
+- spin_unlock(&c->journal.lock);
++ spin_unlock(&c->journal.flush_write_lock);
+
+ if (b) {
+ mutex_lock(&b->write_lock);
+@@ -1099,6 +1099,7 @@ int bch_journal_alloc(struct cache_set *c)
+ struct journal *j = &c->journal;
+
+ spin_lock_init(&j->lock);
++ spin_lock_init(&j->flush_write_lock);
+ INIT_DELAYED_WORK(&j->work, journal_write_work);
+
+ c->journal_delay_ms = 100;
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index a8be14c6f6d9..d8ad99f6191b 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -103,6 +103,7 @@ struct journal_write {
+ /* Embedded in struct cache_set */
+ struct journal {
+ spinlock_t lock;
++ spinlock_t flush_write_lock;
+ /* used when waiting because the journal was full */
+ struct closure_waitlist wait;
+ struct closure io;
+--
+2.16.4
+
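[Editorial sketch] The essence of v2-0016 is a classic lock split: the long heap-building iteration moves off the hot journal.lock onto a dedicated lock. A simplified userspace sketch, with pthread spinlocks standing in for the kernel's spinlock_t and a counter standing in for the c->flush_btree heap; the struct and function names are illustrative only.

    /* lock_split.c - hedged sketch of the lock split. Compile with
     * -lpthread. */
    #include <pthread.h>

    /* Two locks with disjoint duties, mirroring the patch's
     * journal.lock / journal.flush_write_lock. */
    struct journal_like {
            pthread_spinlock_t lock;             /* journal state, pin fifo */
            pthread_spinlock_t flush_write_lock; /* only the flush heap */
            int heap_size;                       /* stand-in for flush_btree */
    };

    /* Heap building holds only flush_write_lock, so concurrent journal
     * writers taking ->lock are no longer blocked behind the long
     * cached-node iteration. */
    static void build_flush_heap(struct journal_like *j)
    {
            pthread_spin_lock(&j->flush_write_lock);
            /* ... iterate cached nodes, push each into the heap ... */
            j->heap_size = 0;
            pthread_spin_unlock(&j->flush_write_lock);
    }

    int main(void)
    {
            struct journal_like j;

            pthread_spin_init(&j.lock, PTHREAD_PROCESS_PRIVATE);
            pthread_spin_init(&j.flush_write_lock, PTHREAD_PROCESS_PRIVATE);
            build_flush_heap(&j);
            return 0;
    }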