diff options
author | Coly Li <colyli@suse.de> | 2019-06-05 18:04:18 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2019-06-05 18:04:18 +0800 |
commit | d89fd20daa9a3e982d567d5ce2f097376c259052 (patch) | |
tree | ceff8aec47b9f5f924d74157efd0f4032df323ee | |
parent | 856537f92f2feb65cfe555ae18dde4a68fde19ea (diff) | |
download | bcache-patches-d89fd20daa9a3e982d567d5ce2f097376c259052.tar.gz |
for-next: update, internal state
83 files changed, 6523 insertions, 0 deletions
diff --git a/for-next/old/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch b/for-next/old/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch new file mode 100644 index 0000000..23eedb3 --- /dev/null +++ b/for-next/old/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch @@ -0,0 +1,53 @@ +From f52ff33eef3a7bc4bf296b34d972c5be28afc10b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 23 May 2019 23:18:10 +0800 +Subject: [PATCH 01/17] bcache: avoid flushing btree node in cache_set_flush() + if io disabled + +When cache_set_flush() is called for too many I/O errors detected on +cache device and the cache set is retiring, inside the function it +doesn't make sense to flush cached btree nodes from c->btree_cache +because CACHE_SET_IO_DISABLE is set on c->flags already and all I/Os +onto cache device will be rejected. + +This patch checks in cache_set_flush() whether CACHE_SET_IO_DISABLE +is set. If yes, it skips flushing the cached btree nodes to save +time and make cache set retiring faster. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 1b63ac876169..f44a666271f5 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1570,13 +1570,17 @@ static void cache_set_flush(struct closure *cl) + if (!IS_ERR_OR_NULL(c->root)) + list_add(&c->root->list, &c->btree_cache); + +- /* Should skip this if we're unregistering because of an error */ +- list_for_each_entry(b, &c->btree_cache, list) { +- mutex_lock(&b->write_lock); +- if (btree_node_dirty(b)) +- __bch_btree_node_write(b, NULL); +- mutex_unlock(&b->write_lock); +- } ++ /* ++ * Avoid flushing cached nodes if cache set is retiring ++ * due to too many I/O errors detected. 
++ */ ++ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ list_for_each_entry(b, &c->btree_cache, list) { ++ mutex_lock(&b->write_lock); ++ if (btree_node_dirty(b)) ++ __bch_btree_node_write(b, NULL); ++ mutex_unlock(&b->write_lock); ++ } + + for_each_cache(ca, c, i) + if (ca->alloc_thread) +-- +2.16.4 + diff --git a/for-next/old/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch b/for-next/old/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch new file mode 100644 index 0000000..f337193 --- /dev/null +++ b/for-next/old/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch @@ -0,0 +1,129 @@ +From 7f20f2a04fc49f06308e0917d4c700b1ca1f5727 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 28 May 2019 21:19:38 +0800 +Subject: [PATCH 02/17] bcache: Revert "bcache: fix high CPU occupancy during + journal" + +This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0. + +The reverted commit enlarges a race between normal btree flush code path and +btree_flush_write(), which causes deadlock when journal space is +exhausted. Reverting it shrinks the race window from 128 btree +nodes to only 1 btree node. 
+ +Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +Cc: Tang Junhui <tang.junhui.linux@gmail.com> +--- + drivers/md/bcache/bcache.h | 2 -- + drivers/md/bcache/journal.c | 47 +++++++++++++++------------------------------ + drivers/md/bcache/util.h | 2 -- + 3 files changed, 15 insertions(+), 36 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index fdf75352e16a..e30a983a68cd 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -726,8 +726,6 @@ struct cache_set { + + #define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +- +- DECLARE_HEAP(struct btree *, flush_btree); + }; + + struct bbio { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 12dae9348147..a7ff60100755 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -391,12 +391,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + } + + /* Journalling */ +-#define journal_max_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +-#define journal_min_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) + + static void btree_flush_write(struct cache_set *c) + { +@@ -404,35 +398,25 @@ static void btree_flush_write(struct cache_set *c) + * Try to find the btree node with that references the oldest journal + * entry, best is our current candidate and is locked if non NULL: + */ +- struct btree *b; +- int i; ++ struct btree *b, *best; ++ unsigned i; + + atomic_long_inc(&c->flush_write); +- + retry: +- spin_lock(&c->journal.lock); +- if (heap_empty(&c->flush_btree)) { +- for_each_cached_btree(b, c, i) +- if (btree_current_write(b)->journal) { +- if (!heap_full(&c->flush_btree)) +- 
heap_add(&c->flush_btree, b, +- journal_max_cmp); +- else if (journal_max_cmp(b, +- heap_peek(&c->flush_btree))) { +- c->flush_btree.data[0] = b; +- heap_sift(&c->flush_btree, 0, +- journal_max_cmp); +- } ++ best = NULL; ++ ++ for_each_cached_btree(b, c, i) ++ if (btree_current_write(b)->journal) { ++ if (!best) ++ best = b; ++ else if (journal_pin_cmp(c, ++ btree_current_write(best)->journal, ++ btree_current_write(b)->journal)) { ++ best = b; + } ++ } + +- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) +- heap_sift(&c->flush_btree, i, journal_min_cmp); +- } +- +- b = NULL; +- heap_pop(&c->flush_btree, b, journal_min_cmp); +- spin_unlock(&c->journal.lock); +- ++ b = best; + if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { +@@ -870,8 +854,7 @@ int bch_journal_alloc(struct cache_set *c) + j->w[0].c = c; + j->w[1].c = c; + +- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || +- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + return -ENOMEM; +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index 1fbced94e4cc..c029f7443190 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -113,8 +113,6 @@ do { \ + + #define heap_full(h) ((h)->used == (h)->size) + +-#define heap_empty(h) ((h)->used == 0) +- + #define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ +-- +2.16.4 + diff --git a/for-next/old/0003-bcache-Revert-bcache-add-journal-statistic.patch b/for-next/old/0003-bcache-Revert-bcache-add-journal-statistic.patch new file mode 100644 index 0000000..7242a54 --- /dev/null +++ b/for-next/old/0003-bcache-Revert-bcache-add-journal-statistic.patch @@ -0,0 +1,108 @@ +From 9ae3221cdc796b2b13a05534884ce10f87557e7b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: 
Tue, 28 May 2019 21:25:43 +0800 +Subject: [PATCH 03/17] bcache: Revert "bcache: add journal statistic" + +This reverts commit a728eacbbdd229d1d903e46261c57d5206f87a4a. + +The following new fixes for race in btree_flush_write() will rewrite +current code logic, the variables to count for statistic don't exist +anymore. So this patch is useless for now. + +Fixes: a728eacbbdd2 ("bcache: add journal statistic") +Signed-off-by: Coly Li <colyli@suse.de> +CC: Tang Junhui <tang.junhui.linux@gmail.com> +--- + drivers/md/bcache/bcache.h | 4 ---- + drivers/md/bcache/journal.c | 5 ----- + drivers/md/bcache/sysfs.c | 15 --------------- + 3 files changed, 24 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index e30a983a68cd..baf06209fb89 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -704,10 +704,6 @@ struct cache_set { + atomic_long_t writeback_keys_done; + atomic_long_t writeback_keys_failed; + +- atomic_long_t reclaim; +- atomic_long_t flush_write; +- atomic_long_t retry_flush_write; +- + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index a7ff60100755..d62e0b7f978c 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -400,8 +400,6 @@ static void btree_flush_write(struct cache_set *c) + */ + struct btree *b, *best; + unsigned i; +- +- atomic_long_inc(&c->flush_write); + retry: + best = NULL; + +@@ -422,7 +420,6 @@ static void btree_flush_write(struct cache_set *c) + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + /* We raced */ +- atomic_long_inc(&c->retry_flush_write); + goto retry; + } + +@@ -502,8 +499,6 @@ static void journal_reclaim(struct cache_set *c) + unsigned int iter, n = 0; + atomic_t p __maybe_unused; + +- atomic_long_inc(&c->reclaim); +- + while (!atomic_read(&fifo_front(&c->journal.pin))) + fifo_pop(&c->journal.pin, p); + +diff --git a/drivers/md/bcache/sysfs.c 
b/drivers/md/bcache/sysfs.c +index 6cd44d3cf906..4a700c9eb70b 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -83,9 +83,6 @@ read_attribute(bset_tree_stats); + + read_attribute(state); + read_attribute(cache_read_races); +-read_attribute(reclaim); +-read_attribute(flush_write); +-read_attribute(retry_flush_write); + read_attribute(writeback_keys_done); + read_attribute(writeback_keys_failed); + read_attribute(io_errors); +@@ -685,15 +682,6 @@ SHOW(__bch_cache_set) + sysfs_print(cache_read_races, + atomic_long_read(&c->cache_read_races)); + +- sysfs_print(reclaim, +- atomic_long_read(&c->reclaim)); +- +- sysfs_print(flush_write, +- atomic_long_read(&c->flush_write)); +- +- sysfs_print(retry_flush_write, +- atomic_long_read(&c->retry_flush_write)); +- + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, +@@ -908,9 +896,6 @@ static struct attribute *bch_cache_set_internal_files[] = { + + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, +- &sysfs_reclaim, +- &sysfs_flush_write, +- &sysfs_retry_flush_write, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + +-- +2.16.4 + diff --git a/for-next/old/0004-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch b/for-next/old/0004-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch new file mode 100644 index 0000000..9899f36 --- /dev/null +++ b/for-next/old/0004-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch @@ -0,0 +1,35 @@ +From a244fdd5c635a070e03fabf2f710e038f4c9a6c5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 28 May 2019 21:36:56 +0800 +Subject: [PATCH 04/17] bcache: Revert "bcache: free heap + cache_set->flush_btree in bch_journal_free" + +This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6. + +This patch depends on commit c4dc2497d50d ("bcache: fix high CPU +occupancy during journal") which is reverted in previous patch. So +revert this one too. 
+ +Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +Cc: Shenghui Wang <shhuiw@foxmail.com> +--- + drivers/md/bcache/journal.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index d62e0b7f978c..af3025d79fc5 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -834,7 +834,6 @@ void bch_journal_free(struct cache_set *c) + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +- free_heap(&c->flush_btree); + } + + int bch_journal_alloc(struct cache_set *c) +-- +2.16.4 + diff --git a/for-next/old/0005-bcache-ignore-read-ahead-request-failure-on-backing-.patch b/for-next/old/0005-bcache-ignore-read-ahead-request-failure-on-backing-.patch new file mode 100644 index 0000000..7872d61 --- /dev/null +++ b/for-next/old/0005-bcache-ignore-read-ahead-request-failure-on-backing-.patch @@ -0,0 +1,55 @@ +From bc44ed34a12549b3e8051cd0cd3652162c4e1c0b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 22:48:09 +0800 +Subject: [PATCH 05/17] bcache: ignore read-ahead request failure on backing + device + +When md raid device (e.g. raid456) is used as backing device, read-ahead +requests on a degrading and recovering md raid device might be failured +immediately by md raid code, but indeed this md raid array can still be +read or write for normal I/O requests. Therefore such failed read-ahead +request are not real hardware failure. Further more, after degrading and +recovering accomplished, read-ahead requests will be handled by md raid +array again. + +For such condition, I/O failures of read-ahead requests don't indicate +real health status (because normal I/O still be served), they should not +be counted into I/O error counter dc->io_errors. 
+ +Since there is no simple way to detect whether the backing device is a +md raid device, this patch simply ignores I/O failures for read-ahead +bios on backing device, to avoid bogus backing device failure on a +degrading md raid array. + +Suggested-and-tested-by: Thorsten Knabe <linux@thorsten-knabe.de> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/io.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index c25097968319..4d93f07f63e5 100644 +--- a/drivers/md/bcache/io.c ++++ b/drivers/md/bcache/io.c +@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) + + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + ++ /* ++ * Read-ahead requests on a degrading and recovering md raid ++ * (e.g. raid6) device might be failured immediately by md ++ * raid code, which is not a real hardware media failure. So ++ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. 
++ */ ++ if (bio->bi_opf & REQ_RAHEAD) { ++ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", ++ dc->backing_dev_name); ++ return; ++ } ++ + errors = atomic_add_return(1, &dc->io_errors); + if (errors < dc->error_limit) + pr_err("%s: IO error on backing device, unrecoverable", +-- +2.16.4 + diff --git a/for-next/old/0006-bcache-add-io-error-counting-in-write_bdev_super_end.patch b/for-next/old/0006-bcache-add-io-error-counting-in-write_bdev_super_end.patch new file mode 100644 index 0000000..85c521f --- /dev/null +++ b/for-next/old/0006-bcache-add-io-error-counting-in-write_bdev_super_end.patch @@ -0,0 +1,38 @@ +From 81d4485ea60256b740ac52384b900046516dd61a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:42:39 +0800 +Subject: [PATCH 06/17] bcache: add io error counting in + write_bdev_super_endio() + +When backing device super block is written by bch_write_bdev_super(), +the bio complete callback write_bdev_super_endio() simply ignores I/O +status. Indeed such a write request also contributes to backing device +health status if the request failed. + +This patch checks bio->bi_status in write_bdev_super_endio(); if there +is an error, bch_count_backing_io_errors() will be called to count an I/O +error to dc->io_errors. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f44a666271f5..c486a9de1219 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + static void write_bdev_super_endio(struct bio *bio) + { + struct cached_dev *dc = bio->bi_private; +- /* XXX: error checking */ ++ ++ if (bio->bi_status) ++ bch_count_backing_io_errors(dc, bio); + + closure_put(&dc->sb_write); + } +-- +2.16.4 + diff --git a/for-next/old/0007-bcache-remove-XXX-comment-line-from-run_cache_set.patch b/for-next/old/0007-bcache-remove-XXX-comment-line-from-run_cache_set.patch new file mode 100644 index 0000000..ddfb632 --- /dev/null +++ b/for-next/old/0007-bcache-remove-XXX-comment-line-from-run_cache_set.patch @@ -0,0 +1,31 @@ +From c3c9aca1934a35cc6b1c5d94cb01be11a1b26807 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:47:38 +0800 +Subject: [PATCH 07/17] bcache: remove "XXX:" comment line from run_cache_set() + +In previous bcache patches for Linux v5.2, the failure code path of +run_cache_set() is tested and fixed. 
So now the following comment +line can be removed from run_cache_set(), + /* XXX: test this, it's broken */ + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index c486a9de1219..962c53493cf0 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1963,7 +1963,7 @@ static int run_cache_set(struct cache_set *c) + } + + closure_sync(&cl); +- /* XXX: test this, it's broken */ ++ + bch_cache_set_error(c, "%s", err); + + return -EIO; +-- +2.16.4 + diff --git a/for-next/old/0008-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch b/for-next/old/0008-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch new file mode 100644 index 0000000..7c52abb --- /dev/null +++ b/for-next/old/0008-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch @@ -0,0 +1,56 @@ +From 8ea4429c8b78e7fe1ccba6172f780cce7d15a27e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:23:35 +0800 +Subject: [PATCH 08/17] bcache: remove unnecessary prefetch() in + bset_search_tree() + +In function bset_search_tree(), when p >= t->size, t->tree[0] will be +prefetched by the following code piece, + 974 unsigned int p = n << 4; + 975 + 976 p &= ((int) (p - t->size)) >> 31; + 977 + 978 prefetch(&t->tree[p]); + +The purpose of the above code is to avoid a branch instruction, but +when p >= t->size, prefetch(&t->tree[0]) has no positive performance +contribution at all. This patch avoids the unnecessary prefetch by only +calling prefetch() when p < t->size. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 16 ++-------------- + 1 file changed, 2 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index 8f07fa6e1739..aa2e4ab0fab9 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -960,22 +960,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + unsigned int inorder, j, n = 1; + + do { +- /* +- * A bit trick here. +- * If p < t->size, (int)(p - t->size) is a minus value and +- * the most significant bit is set, right shifting 31 bits +- * gets 1. If p >= t->size, the most significant bit is +- * not set, right shifting 31 bits gets 0. +- * So the following 2 lines equals to +- * if (p >= t->size) +- * p = 0; +- * but a branch instruction is avoided. +- */ + unsigned int p = n << 4; + +- p &= ((int) (p - t->size)) >> 31; +- +- prefetch(&t->tree[p]); ++ if (p < t->size) ++ prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; +-- +2.16.4 + diff --git a/for-next/old/0009-bcache-make-bset_search_tree-be-more-understandable.patch b/for-next/old/0009-bcache-make-bset_search_tree-be-more-understandable.patch new file mode 100644 index 0000000..8223834 --- /dev/null +++ b/for-next/old/0009-bcache-make-bset_search_tree-be-more-understandable.patch @@ -0,0 +1,58 @@ +From 3af55633bbca2965b06c333bfb8454577d662753 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:51:40 +0800 +Subject: [PATCH 09/17] bcache: make bset_search_tree() be more understandable + +The purpose of following code in bset_search_tree() is to avoid a branch +instruction, + 994 if (likely(f->exponent != 127)) + 995 n = j * 2 + (((unsigned int) + 996 (f->mantissa - + 997 bfloat_mantissa(search, f))) >> 31); + 998 else + 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) +1000 ? 
j * 2 +1001 : j * 2 + 1; + +This piece of code is not easy to understand; even when I tried to +add a code comment for it, I made a mistake. This patch removes the implicit +bit operation and uses an explicit branch to calculate the next location in +binary tree search. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 17 +++-------------- + 1 file changed, 3 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index aa2e4ab0fab9..f752cc791f50 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -968,21 +968,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + j = n; + f = &t->tree[j]; + +- /* +- * Similar bit trick, use subtract operation to avoid a branch +- * instruction. +- * +- * n = (f->mantissa > bfloat_mantissa()) +- * ? j * 2 +- * : j * 2 + 1; +- * +- * We need to subtract 1 from f->mantissa for the sign bit trick +- * to work - that's done in make_bfloat() +- */ + if (likely(f->exponent != 127)) +- n = j * 2 + (((unsigned int) +- (f->mantissa - +- bfloat_mantissa(search, f))) >> 31); ++ n = (f->mantissa >= bfloat_mantissa(search, f)) ++ ? j * 2 ++ : j * 2 + 1; + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ? 
j * 2 +-- +2.16.4 + diff --git a/for-next/old/0010-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch b/for-next/old/0010-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch new file mode 100644 index 0000000..e982a84 --- /dev/null +++ b/for-next/old/0010-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch @@ -0,0 +1,97 @@ +From f65e282de5a589918ec171c4ba4cbec635f7e8e2 Mon Sep 17 00:00:00 2001 +From: Alexandru Ardelean <alexandru.ardelean@analog.com> +Date: Tue, 7 May 2019 12:43:12 +0300 +Subject: [PATCH 10/17] bcache: use sysfs_match_string() instead of + __sysfs_match_string() + +The arrays (of strings) that are passed to __sysfs_match_string() are +static, so use sysfs_match_string() which does an implicit ARRAY_SIZE() +over these arrays. + +Functionally, this doesn't change anything. +The change is more cosmetic. + +It only shrinks the static arrays by 1 byte each. + +Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/sysfs.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4a700c9eb70b..f6847ae15cd3 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -21,28 +21,24 @@ static const char * const bch_cache_modes[] = { + "writethrough", + "writeback", + "writearound", +- "none", +- NULL ++ "none" + }; + + /* Default is 0 ("auto") */ + static const char * const bch_stop_on_failure_modes[] = { + "auto", +- "always", +- NULL ++ "always" + }; + + static const char * const cache_replacement_policies[] = { + "lru", + "fifo", +- "random", +- NULL ++ "random" + }; + + static const char * const error_actions[] = { + "unregister", +- "panic", +- NULL ++ "panic" + }; + + write_attribute(attach); +@@ -330,7 +326,7 @@ STORE(__cached_dev) + bch_cached_dev_run(dc); + + if (attr == &sysfs_cache_mode) { +- v = __sysfs_match_string(bch_cache_modes, -1, 
buf); ++ v = sysfs_match_string(bch_cache_modes, buf); + if (v < 0) + return v; + +@@ -341,7 +337,7 @@ STORE(__cached_dev) + } + + if (attr == &sysfs_stop_when_cache_set_failed) { +- v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); ++ v = sysfs_match_string(bch_stop_on_failure_modes, buf); + if (v < 0) + return v; + +@@ -782,7 +778,7 @@ STORE(__bch_cache_set) + 0, UINT_MAX); + + if (attr == &sysfs_errors) { +- v = __sysfs_match_string(error_actions, -1, buf); ++ v = sysfs_match_string(error_actions, buf); + if (v < 0) + return v; + +@@ -1043,7 +1039,7 @@ STORE(__bch_cache) + } + + if (attr == &sysfs_cache_replacement_policy) { +- v = __sysfs_match_string(cache_replacement_policies, -1, buf); ++ v = sysfs_match_string(cache_replacement_policies, buf); + if (v < 0) + return v; + +-- +2.16.4 + diff --git a/for-next/old/0011-bcache-add-return-value-check-to-bch_cached_dev_run.patch b/for-next/old/0011-bcache-add-return-value-check-to-bch_cached_dev_run.patch new file mode 100644 index 0000000..e1d8c55 --- /dev/null +++ b/for-next/old/0011-bcache-add-return-value-check-to-bch_cached_dev_run.patch @@ -0,0 +1,151 @@ +From 788e1b23e22d1ce633d2970a0f188c7f3e100a82 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 22:16:38 +0800 +Subject: [PATCH 11/17] bcache: add return value check to bch_cached_dev_run() + +This patch adds a return value check to bch_cached_dev_run(), so that if +an error happens inside bch_cached_dev_run(), it can be caught. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 32 +++++++++++++++++++++++++------- + drivers/md/bcache/sysfs.c | 7 +++++-- + 3 files changed, 31 insertions(+), 10 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index baf06209fb89..ffeb16c8af0a 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -1000,7 +1000,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); + int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); + void bch_cached_dev_detach(struct cached_dev *dc); +-void bch_cached_dev_run(struct cached_dev *dc); ++int bch_cached_dev_run(struct cached_dev *dc); + void bcache_device_stop(struct bcache_device *d); + + void bch_cache_set_unregister(struct cache_set *c); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 962c53493cf0..cb9abfa73d02 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) + } + + +-void bch_cached_dev_run(struct cached_dev *dc) ++int bch_cached_dev_run(struct cached_dev *dc) + { + struct bcache_device *d = &dc->disk; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); +@@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + ++ if (dc->io_disable) ++ return -EIO; ++ + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); + kfree(buf); +- return; ++ return -EBUSY; + } + + if (!d->c && +@@ -951,8 +954,10 @@ void bch_cached_dev_run(struct cached_dev *dc) + kfree(buf); + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || +- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) ++ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { + pr_debug("error creating sysfs link"); ++ return -ENOMEM; ++ } + + dc->status_update_thread = 
kthread_run(cached_dev_status_update, + dc, "bcache_status_update"); +@@ -961,6 +966,8 @@ void bch_cached_dev_run(struct cached_dev *dc) + "continue to run without monitoring backing " + "device status"); + } ++ ++ return 0; + } + + /* +@@ -1056,6 +1063,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); + struct uuid_entry *u; + struct cached_dev *exist_dc, *t; ++ int ret = 0; + + if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) +@@ -1165,7 +1173,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + + bch_sectors_dirty_init(&dc->disk); + +- bch_cached_dev_run(dc); ++ ret = bch_cached_dev_run(dc); ++ if (ret) { ++ up_write(&dc->writeback_lock); ++ return ret; ++ } ++ + bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); + +@@ -1292,6 +1305,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, + { + const char *err = "cannot allocate memory"; + struct cache_set *c; ++ int ret = -ENOMEM; + + bdevname(bdev, dc->backing_dev_name); + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); +@@ -1321,14 +1335,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, + bch_cached_dev_attach(dc, c, NULL); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || +- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) +- bch_cached_dev_run(dc); ++ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { ++ err = "failed to run cached device"; ++ ret = bch_cached_dev_run(dc); ++ if (ret) ++ goto err; ++ } + + return 0; + err: + pr_notice("error %s: %s", dc->backing_dev_name, err); + bcache_device_stop(&dc->disk); +- return -EIO; ++ return ret; + } + + /* Flash only volumes */ +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index f6847ae15cd3..ed5381e341aa 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -322,8 +322,11 @@ 
STORE(__cached_dev) + bch_cache_accounting_clear(&dc->accounting); + + if (attr == &sysfs_running && +- strtoul_or_return(buf)) +- bch_cached_dev_run(dc); ++ strtoul_or_return(buf)) { ++ v = bch_cached_dev_run(dc); ++ if (v) ++ return v; ++ } + + if (attr == &sysfs_cache_mode) { + v = sysfs_match_string(bch_cache_modes, buf); +-- +2.16.4 + diff --git a/for-next/old/0012-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch b/for-next/old/0012-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch new file mode 100644 index 0000000..9c328be --- /dev/null +++ b/for-next/old/0012-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch @@ -0,0 +1,72 @@ +From b9c2ad9842765caf511b1461666a6c7afb82aa92 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 22:36:35 +0800 +Subject: [PATCH 12/17] bcache: remove unncessary code in bch_btree_keys_init() + +Function bch_btree_keys_init() initializes b->set[].size and +b->set[].data to zero. As the code comment indicates, this code is indeed +unnecessary, because both struct btree_keys and struct bset_tree are +embedded (nested) in struct btree: when struct btree is filled with 0 +bits by kzalloc() in mca_bucket_alloc(), b->set[].size and +b->set[].data are initialized to 0 (a.k.a. NULL) already. + +This patch removes the redundant code, and adds comments in +bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 15 ++++++--------- + drivers/md/bcache/btree.c | 4 ++++ + 2 files changed, 10 insertions(+), 9 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index f752cc791f50..32e2e4d8fa6c 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); + void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) + { +- unsigned int i; +- + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + +- /* XXX: shouldn't be needed */ +- for (i = 0; i < MAX_BSETS; i++) +- b->set[i].size = 0; + /* +- * Second loop starts at 1 because b->keys[0]->data is the memory we +- * allocated ++ * struct btree_keys in embedded in struct btree, and struct ++ * bset_tree is embedded into struct btree_keys. They are all ++ * initialized as 0 by kzalloc() in mca_bucket_alloc(), and ++ * b->set[0].data is allocated in bch_btree_keys_alloc(), so we ++ * don't have to initiate b->set[].size and b->set[].data here ++ * any more. + */ +- for (i = 1; i < MAX_BSETS; i++) +- b->set[i].data = NULL; + } + EXPORT_SYMBOL(bch_btree_keys_init); + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 773f5fdad25f..cf38a1b031fa 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) + static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) + { ++ /* ++ * kzalloc() is necessary here for initialization, ++ * see code comments in bch_btree_keys_init(). 
++ */ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + + if (!b) +-- +2.16.4 + diff --git a/for-next/old/0013-bcache-avoid-a-deadlock-in-bcache_reboot.patch b/for-next/old/0013-bcache-avoid-a-deadlock-in-bcache_reboot.patch new file mode 100644 index 0000000..8816fb5 --- /dev/null +++ b/for-next/old/0013-bcache-avoid-a-deadlock-in-bcache_reboot.patch @@ -0,0 +1,216 @@ +From f2cffa615c588b527c3a0bd1d6eaf22755b71dc3 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 23:19:55 +0800 +Subject: [PATCH 13/17] bcache: avoid a deadlock in bcache_reboot() + +It is quite frequently to observe deadlock in bcache_reboot() happens +and hang the system reboot process. The reason is, in bcache_reboot() +when calling bch_cache_set_stop() and bcache_device_stop() the mutex +bch_register_lock is held. But in the process to stop cache set and +bcache device, bch_register_lock will be acquired again. If this mutex +is held here, deadlock will happen inside the stopping process. The +aftermath of the deadlock is, whole system reboot gets hung. + +The fix is to avoid holding bch_register_lock for the following loops +in bcache_reboot(), + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + +A module range variable 'bcache_is_reboot' is added, it sets to true +in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked +to be true, reject the registration by returning -EBUSY immediately. + +After the fix, there is still a small race window. Here is the example. +1) a pointer 'c' is slected from list bch_cache_sets in the loop + list_for_each_entry_safe(). +2) In bch_cache_set_stop() bit CACHE_SET_STOPPING is tested. 
+ +During the time window between step 1) and 2), if cache set 'c' is +retired and memory object is released for too many I/O errors, testing +CACHE_SET_STOPPING on c->flags may trigger a NULL pointer deference. + +Fortunately the above race window is very small, it is much less +probably to happen and has same result as bch_register_lock (non clearly +shut down bcache device), comparing the deadlock in bcache_reboot(). So +in order to fix the deadlock in bcache_reboot(), such small race is +acceptable as cost for the simple fix. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 35 ++++++++++++++++++++++++++++++++++- + drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++++++++ + 2 files changed, 60 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index cb9abfa73d02..f445ddd74688 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -40,6 +40,7 @@ static const char invalid_uuid[] = { + + static struct kobject *bcache_kobj; + struct mutex bch_register_lock; ++bool bcache_is_reboot; + LIST_HEAD(bch_cache_sets); + static LIST_HEAD(uncached_devices); + +@@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; + struct workqueue_struct *bcache_wq; + struct workqueue_struct *bch_journal_wq; + ++ + #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) + /* limitation of partitions number on single bcache device */ + #define BCACHE_MINORS 128 +@@ -2325,6 +2327,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto err; +@@ -2414,21 +2419,47 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + struct cache_set *c, *tc; + struct cached_dev *dc, *tdc; + ++ /* New registration is rejected since now */ ++ bcache_is_reboot = true; ++ /* ++ * Make registering 
caller (if there is) on other CPU ++ * core know bcache_is_reboot set to true earlier ++ */ ++ smp_mb(); ++ + mutex_lock(&bch_register_lock); + + if (list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + ++ mutex_unlock(&bch_register_lock); ++ + pr_info("Stopping all devices:"); + ++ /* ++ * The reason bch_register_lock is not held to call ++ * bch_cache_set_stop() and bcache_device_stop() is to ++ * avoid potential deadlock during reboot, because cache ++ * set or bcache device stopping process will acqurie ++ * bch_register_lock too. ++ * ++ * We are safe here because bcache_is_reboot sets to ++ * true already, register_bcache() will reject new ++ * registration now. There is still a small race here, ++ * between a list node fetched and the flags bits tested ++ * in bch_cache_set_stop() or bcache_device_stop(). ++ * Such race in reboot process only results non-clearly ++ * shut down bcache device, same as the bch_register_deadlock ++ * does, but much less probably to happen. So we take the ++ * cost here. 
++ */ + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + +- mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and +@@ -2555,6 +2586,8 @@ static int __init bcache_init(void) + bch_debug_init(); + closure_debug_init(); + ++ bcache_is_reboot = false; ++ + return 0; + err: + bcache_exit(); +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index ed5381e341aa..a6f5da7f89d9 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -16,6 +16,8 @@ + #include <linux/sort.h> + #include <linux/sched/clock.h> + ++extern bool bcache_is_reboot; ++ + /* Default is 0 ("writethrough") */ + static const char * const bch_cache_modes[] = { + "writethrough", +@@ -264,6 +266,10 @@ STORE(__cached_dev) + struct cache_set *c; + struct kobj_uevent_env *env; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + #define d_strtoul(var) sysfs_strtoul(var, dc->var) + #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) + #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) +@@ -404,6 +410,10 @@ STORE(bch_cached_dev) + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + mutex_lock(&bch_register_lock); + size = __cached_dev_store(kobj, attr, buf, size); + +@@ -502,6 +512,10 @@ STORE(__bch_flash_dev) + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + sysfs_strtoul(data_csum, d->data_csum); + + if (attr == &sysfs_size) { +@@ -728,6 +742,10 @@ STORE(__bch_cache_set) + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + ssize_t v; + ++ /* no user space access if system is rebooting */ ++ if 
(bcache_is_reboot) ++ return -EBUSY; ++ + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + +@@ -847,6 +865,10 @@ STORE(bch_cache_set_internal) + { + struct cache_set *c = container_of(kobj, struct cache_set, internal); + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + return bch_cache_set_store(&c->kobj, attr, buf, size); + } + +@@ -1029,6 +1051,10 @@ STORE(__bch_cache) + struct cache *ca = container_of(kobj, struct cache, kobj); + ssize_t v; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + +-- +2.16.4 + diff --git a/for-next/old/0014-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch b/for-next/old/0014-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch new file mode 100644 index 0000000..909cba2 --- /dev/null +++ b/for-next/old/0014-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch @@ -0,0 +1,52 @@ +From 444036782baf9ec58dacb7f249a21a6fd4e53059 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 22 May 2019 21:55:09 +0800 +Subject: [PATCH 14/17] bcache: check CACHE_SET_IO_DISABLE in allocator code + +If CACHE_SET_IO_DISABLE of a cache set flag is set by too many I/O +errors, currently allocator routines can still continue allocate +space which may introduce inconsistent metadata state. + +This patch checkes CACHE_SET_IO_DISABLE bit in following allocator +routines, +- bch_bucket_alloc() +- __bch_bucket_alloc_set() +Once CACHE_SET_IO_DISABLE is set on cache set, the allocator routines +may reject allocation request earlier to avoid potential inconsistent +metadata. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/alloc.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index f8986effcb50..34ae5bb6724a 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) + struct bucket *b; + long r; + ++ ++ /* No allocation if CACHE_SET_IO_DISABLE set */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) ++ return -1; ++ + /* fastpath */ + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) +@@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + { + int i; + ++ /* No allocation if CACHE_SET_IO_DISABLE set */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return -1; ++ + lockdep_assert_held(&c->bucket_lock); + BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); + +-- +2.16.4 + diff --git a/for-next/old/0015-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch b/for-next/old/0015-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch new file mode 100644 index 0000000..31987f8 --- /dev/null +++ b/for-next/old/0015-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch @@ -0,0 +1,39 @@ +From aa3aacfe703aed11754fc418fbfbc62a5f8a2ba8 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 22 May 2019 22:06:21 +0800 +Subject: [PATCH 15/17] bcache: check CACHE_SET_IO_DISABLE bit in bch_journal() + +When too many I/O errors happen on cache set and CACHE_SET_IO_DISABLE +bit is set, bch_journal() may continue to work because the journaling +bkey might be still in write set yet. The caller of bch_journal() may +believe the journal still work but the truth is in-memory journal write +set won't be written into cache device any more. This behavior may +introduce potential inconsistent metadata status. 
+ +This patch checks CACHE_SET_IO_DISABLE bit at the head of bch_journal(), +if the bit is set, bch_journal() returns NULL immediately to notify +the caller that the journal does not work. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index af3025d79fc5..938dc4904778 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -790,6 +790,10 @@ atomic_t *bch_journal(struct cache_set *c, + struct journal_write *w; + atomic_t *ret; + ++ /* No journaling if CACHE_SET_IO_DISABLE set already */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return NULL; ++ + if (!CACHE_SYNC(&c->sb)) + return NULL; + +-- +2.16.4 + diff --git a/for-next/old/0016-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch b/for-next/old/0016-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch new file mode 100644 index 0000000..f5fa86d --- /dev/null +++ b/for-next/old/0016-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch @@ -0,0 +1,107 @@ +From 2357f63dbac0253fd991c3a90fd1fd89097bf32d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 20 Mar 2019 23:11:59 +0800 +Subject: [PATCH 16/17] bcache: add pendings_cleanup to stop pending bcache + device + +If a bcache device is in dirty state and its cache set is not +registered, this bcache device will not appear in /dev/bcache<N>, +and there is no way to stop it or remove the bcache kernel module. + +This is an as-designed behavior, but sometimes people have to reboot +the whole system to release or stop the pending backing device. + +This sysfs interface may remove such pending bcache devices when +anything is written into the sysfs file manually. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 55 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f445ddd74688..604560f7d73c 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2277,9 +2277,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, + + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); ++kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); + + static bool bch_is_open_backing(struct block_device *bdev) + { +@@ -2407,6 +2411,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + goto out; + } + ++ ++struct pdev { ++ struct list_head list; ++ struct cached_dev *dc; ++}; ++ ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, ++ size_t size) ++{ ++ LIST_HEAD(pending_devs); ++ ssize_t ret = size; ++ struct cached_dev *dc, *tdc; ++ struct pdev *pdev, *tpdev; ++ struct cache_set *c, *tc; ++ ++ mutex_lock(&bch_register_lock); ++ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { ++ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); ++ if (!pdev) ++ break; ++ pdev->dc = dc; ++ list_add(&pdev->list, &pending_devs); ++ } ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { ++ char *pdev_set_uuid = pdev->dc->sb.set_uuid; ++ char *set_uuid = c->sb.uuid; ++ ++ if (!memcmp(pdev_set_uuid, set_uuid, 16)) { ++ list_del(&pdev->list); ++ kfree(pdev); ++ break; ++ } ++ } ++ } ++ 
mutex_unlock(&bch_register_lock); ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ pr_info("delete pdev %p", pdev); ++ list_del(&pdev->list); ++ bcache_device_stop(&pdev->dc->disk); ++ kfree(pdev); ++ } ++ ++ return ret; ++} ++ + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + { + if (code == SYS_DOWN || +@@ -2551,6 +2605,7 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++ &ksysfs_pendings_cleanup.attr, + NULL + }; + +-- +2.16.4 + diff --git a/for-next/old/0017-bcache-reload-jouranl-key-information-during-journal.patch b/for-next/old/0017-bcache-reload-jouranl-key-information-during-journal.patch new file mode 100644 index 0000000..9ed22f2 --- /dev/null +++ b/for-next/old/0017-bcache-reload-jouranl-key-information-during-journal.patch @@ -0,0 +1,161 @@ +From b347b87e9c61c1daadce6f25ee4cf07f2b8dbc8e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 27 Feb 2019 20:32:22 +0800 +Subject: [PATCH 17/17] bcache: reload jouranl key information during journal + replay + +When bcache journal initiates during running cache set, cache set +journal.blocks_free is initiated as 0. Then during journal replay if +journal_meta() is called and an empty jset is written to cache device, +journal_reclaim() is called. If there is available journal bucket to +reclaim, c->journal.blocks_free is set to numbers of blocks of a journal +bucket, which is c->sb.bucket_size >> c->block_bits. + +Most of time the above process works correctly, expect the condtion +when journal space is almost full. "Almost full" means there is no free +journal bucket, but there are still free blocks in last available +bucket indexed by ja->cur_idx. + +If system crashes or reboots when journal space is almost full, problem +comes. 
During cache set reload after the reboot, c->journal.blocks_free +is initialized as 0, when journal replay process writes bcache journal, +journal_reclaim() will be called to reclaim available journal bucket and +set c->journal.blocks_free to c->sb.bucket_size >> c->block_bits. But +there is no fully free bucket to reclaim in journal_reclaim(), so value +of c->journal.blocks_free will keep 0. If the first journal entry +processed by journal_replay() causes btree split and requires writing +journal space by journal_meta(), journal_meta() has to go into an +infinite loop to reclaim journal bucket, and blocks the whole cache set +to run. + +Such buggy situation can be solved if we do following things before +journal replay starts, +- Recover previous value of c->journal.blocks_free in last run time, + and set it to current c->journal.blocks_free as initial value. +- Recover previous value of ja->cur_idx in last run time, and set it to + KEY_PTR of current c->journal.key as initial value. + +After c->journal.blocks_free and c->journal.key are recovered, in +condition when journal space is almost full and cache set is reloaded, +meta journal entry from journal replay can be written into free blocks of +the last available journal bucket, then old journal entries can be +replayed and reclaimed for further journaling request. + +This patch adds bch_journal_key_reload() to recover journal blocks_free +and key ptr value for above purpose. bch_journal_key_reload() is called +in bch_journal_read() before replaying journal by bch_journal_replay(). 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 87 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 938dc4904778..4fc0752f5580 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -143,6 +143,89 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int bch_journal_key_reload(struct cache_set *c) ++{ ++ struct cache *ca; ++ unsigned int iter, n = 0; ++ struct bkey *k = &c->journal.key; ++ int ret = 0; ++ ++ for_each_cache(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ struct bio *bio = &ja->bio; ++ struct jset *j, *data = c->journal.w[0].data; ++ struct closure cl; ++ unsigned int len, left; ++ unsigned int offset = 0, used_blocks = 0; ++ sector_t bucket = bucket_to_sector(c, ca->sb.d[ja->cur_idx]); ++ ++ closure_init_stack(&cl); ++ ++ while (offset < ca->sb.bucket_size) { ++reread: left = ca->sb.bucket_size - offset; ++ len = min_t(unsigned int, ++ left, PAGE_SECTORS << JSET_BITS); ++ ++ bio_reset(bio); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio_set_dev(bio, ca->bdev); ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = &cl; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(c, bio, &cl); ++ closure_sync(&cl); ++ ++ j = data; ++ while (len) { ++ size_t blocks, bytes = set_bytes(j); ++ ++ if (j->magic != jset_magic(&ca->sb)) ++ goto out; ++ ++ if (bytes > left << 9 || ++ bytes > PAGE_SIZE << JSET_BITS) { ++ pr_err("jset may be correpted: too big"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if (bytes > len << 9) ++ goto reread; ++ ++ if (j->csum != csum_set(j)) { ++ pr_err("jset may be corrupted: bad csum"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ blocks = set_blocks(j, block_bytes(c)); ++ used_blocks += blocks; ++ ++ offset += blocks * 
ca->sb.block_size; ++ len -= blocks * ca->sb.block_size; ++ j = ((void *) j) + blocks * block_bytes(ca); ++ } ++ } ++out: ++ c->journal.blocks_free = ++ (c->sb.bucket_size >> c->block_bits) - ++ used_blocks; ++ ++ k->ptr[n++] = MAKE_PTR(0, bucket, ca->sb.nr_this_dev); ++ } ++ ++ BUG_ON(n == 0); ++ bkey_init(k); ++ SET_KEY_PTRS(k, n); ++ ++err: ++ return ret; ++} ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -268,6 +351,10 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + struct journal_replay, + list)->j.seq; + ++ /* Initial value of c->journal.blocks_free should be 0 */ ++ BUG_ON(c->journal.blocks_free != 0); ++ ret = bch_journal_key_reload(c); ++ + return ret; + #undef read_bucket + } +-- +2.16.4 + diff --git a/for-next/old2/0000-cover-letter.patch b/for-next/old2/0000-cover-letter.patch new file mode 100644 index 0000000..d5465ae --- /dev/null +++ b/for-next/old2/0000-cover-letter.patch @@ -0,0 +1,56 @@ +From 83bd89b79c4dfc2daa4d6a72dfe590f105d4b30b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 01:08:15 +0800 +Subject: [PATCH 00/27] *** SUBJECT HERE *** + +*** BLURB HERE *** + +Alexandru Ardelean (1): + bcache: use sysfs_match_string() instead of __sysfs_match_string() + +Coly Li (26): + bcache: avoid flushing btree node in cache_set_flush() if io disabled + bcache: Revert "bcache: fix high CPU occupancy during journal" + bcache: Revert "bcache: free heap cache_set->flush_btree in + bch_journal_free" + bcache: ignore read-ahead request failure on backing device + bcache: add io error counting in write_bdev_super_endio() + bcache: remove "XXX:" comment line from run_cache_set() + bcache: remove unnecessary prefetch() in bset_search_tree() + bcache: make bset_search_tree() be more understandable + bcache: add return value check to bch_cached_dev_run() + bcache: remove unncessary code in bch_btree_keys_init() + bcache: avoid a deadlock in 
bcache_reboot() + bcache: check CACHE_SET_IO_DISABLE in allocator code + bcache: check CACHE_SET_IO_DISABLE bit in bch_journal() + bcache: add pendings_cleanup to stop pending bcache device + bcache: add code comments for journal_read_bucket() + bcache: set largest seq to ja->seq[bucket_index] in + journal_read_bucket() + bcache: simplify bch_journal_read() + bcache: shrink btree node cache after bch_btree_check() + bcache: more detailed error message to bcache_device_link() + bcache: add more error message in bch_cached_dev_attach() + bcache: improve error message in bch_cached_dev_run() + bcache: fix race in btree_flush_write() + bcache: remove retry_flush_write from struct cache_set + bcache: use bcache_mod_wq to replace system wide system_wq + bcache: add reclaimed_journal_buckets to struct cache_set + bcache: acquire bch_register_lock later in cached_dev_detach_finish() + + drivers/md/bcache/alloc.c | 9 ++ + drivers/md/bcache/bcache.h | 6 +- + drivers/md/bcache/bset.c | 48 ++------- + drivers/md/bcache/btree.c | 19 +++- + drivers/md/bcache/btree.h | 2 + + drivers/md/bcache/io.c | 12 +++ + drivers/md/bcache/journal.c | 238 ++++++++++++++++++++------------------------ + drivers/md/bcache/journal.h | 4 + + drivers/md/bcache/super.c | 210 ++++++++++++++++++++++++++++++++------ + drivers/md/bcache/sysfs.c | 63 ++++++++---- + drivers/md/bcache/util.h | 2 - + 11 files changed, 389 insertions(+), 224 deletions(-) + +-- +2.16.4 + diff --git a/for-next/old2/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch b/for-next/old2/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch new file mode 100644 index 0000000..5b37e9a --- /dev/null +++ b/for-next/old2/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch @@ -0,0 +1,53 @@ +From c10fba3925fb78d2a0d4122a6e86f58154a3c82e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 23 May 2019 23:18:10 +0800 +Subject: [PATCH 01/31] bcache: avoid flushing btree node in 
cache_set_flush() + if io disabled + +When cache_set_flush() is called for too many I/O errors detected on +cache device and the cache set is retiring, inside the function it +doesn't make sense to flushing cached btree nodes from c->btree_cache +because CACHE_SET_IO_DISABLE is set on c->flags already and all I/Os +onto cache device will be rejected. + +This patch checks in cache_set_flush() that whether CACHE_SET_IO_DISABLE +is set. If yes, then avoids to flush the cached btree nodes to reduce +more time and make cache set retiring more faster. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 1b63ac876169..f44a666271f5 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1570,13 +1570,17 @@ static void cache_set_flush(struct closure *cl) + if (!IS_ERR_OR_NULL(c->root)) + list_add(&c->root->list, &c->btree_cache); + +- /* Should skip this if we're unregistering because of an error */ +- list_for_each_entry(b, &c->btree_cache, list) { +- mutex_lock(&b->write_lock); +- if (btree_node_dirty(b)) +- __bch_btree_node_write(b, NULL); +- mutex_unlock(&b->write_lock); +- } ++ /* ++ * Avoid flushing cached nodes if cache set is retiring ++ * due to too many I/O errors detected. 
++ */ ++ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ list_for_each_entry(b, &c->btree_cache, list) { ++ mutex_lock(&b->write_lock); ++ if (btree_node_dirty(b)) ++ __bch_btree_node_write(b, NULL); ++ mutex_unlock(&b->write_lock); ++ } + + for_each_cache(ca, c, i) + if (ca->alloc_thread) +-- +2.16.4 + diff --git a/for-next/old2/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch b/for-next/old2/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch new file mode 100644 index 0000000..cf0316b --- /dev/null +++ b/for-next/old2/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch @@ -0,0 +1,129 @@ +From 8d978651ae76cf942f6cc404ef4d7c70e463dc3d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 28 May 2019 21:19:38 +0800 +Subject: [PATCH 02/31] bcache: Revert "bcache: fix high CPU occupancy during + journal" + +This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0. + +This patch enlarges a race between normal btree flush code path and +flush_btree_write(), which causes deadlock when journal space is +exhausted. Reverts this patch makes the race window from 128 btree +nodes to only 1 btree nodes. 
+ +Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +Cc: Tang Junhui <tang.junhui.linux@gmail.com> +--- + drivers/md/bcache/bcache.h | 2 -- + drivers/md/bcache/journal.c | 47 +++++++++++++++------------------------------ + drivers/md/bcache/util.h | 2 -- + 3 files changed, 15 insertions(+), 36 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index fdf75352e16a..e30a983a68cd 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -726,8 +726,6 @@ struct cache_set { + + #define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +- +- DECLARE_HEAP(struct btree *, flush_btree); + }; + + struct bbio { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 12dae9348147..a7ff60100755 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -391,12 +391,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + } + + /* Journalling */ +-#define journal_max_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +-#define journal_min_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) + + static void btree_flush_write(struct cache_set *c) + { +@@ -404,35 +398,25 @@ static void btree_flush_write(struct cache_set *c) + * Try to find the btree node with that references the oldest journal + * entry, best is our current candidate and is locked if non NULL: + */ +- struct btree *b; +- int i; ++ struct btree *b, *best; ++ unsigned i; + + atomic_long_inc(&c->flush_write); +- + retry: +- spin_lock(&c->journal.lock); +- if (heap_empty(&c->flush_btree)) { +- for_each_cached_btree(b, c, i) +- if (btree_current_write(b)->journal) { +- if (!heap_full(&c->flush_btree)) +- 
heap_add(&c->flush_btree, b, +- journal_max_cmp); +- else if (journal_max_cmp(b, +- heap_peek(&c->flush_btree))) { +- c->flush_btree.data[0] = b; +- heap_sift(&c->flush_btree, 0, +- journal_max_cmp); +- } ++ best = NULL; ++ ++ for_each_cached_btree(b, c, i) ++ if (btree_current_write(b)->journal) { ++ if (!best) ++ best = b; ++ else if (journal_pin_cmp(c, ++ btree_current_write(best)->journal, ++ btree_current_write(b)->journal)) { ++ best = b; + } ++ } + +- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) +- heap_sift(&c->flush_btree, i, journal_min_cmp); +- } +- +- b = NULL; +- heap_pop(&c->flush_btree, b, journal_min_cmp); +- spin_unlock(&c->journal.lock); +- ++ b = best; + if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { +@@ -870,8 +854,7 @@ int bch_journal_alloc(struct cache_set *c) + j->w[0].c = c; + j->w[1].c = c; + +- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || +- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + return -ENOMEM; +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index 1fbced94e4cc..c029f7443190 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -113,8 +113,6 @@ do { \ + + #define heap_full(h) ((h)->used == (h)->size) + +-#define heap_empty(h) ((h)->used == 0) +- + #define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ +-- +2.16.4 + diff --git a/for-next/old2/0003-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch b/for-next/old2/0003-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch new file mode 100644 index 0000000..89b66ed --- /dev/null +++ b/for-next/old2/0003-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch @@ -0,0 +1,35 @@ +From adf1ee061acd018bd6329661175fb6c442f8969a Mon Sep 17 00:00:00 2001 +From: 
Coly Li <colyli@suse.de> +Date: Tue, 28 May 2019 21:36:56 +0800 +Subject: [PATCH 03/31] bcache: Revert "bcache: free heap + cache_set->flush_btree in bch_journal_free" + +This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6. + +This patch depends on commit c4dc2497d50d ("bcache: fix high CPU +occupancy during journal") which is reverted in previous patch. So +revert this one too. + +Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +Cc: Shenghui Wang <shhuiw@foxmail.com> +--- + drivers/md/bcache/journal.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index a7ff60100755..38849736fa1c 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -839,7 +839,6 @@ void bch_journal_free(struct cache_set *c) + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +- free_heap(&c->flush_btree); + } + + int bch_journal_alloc(struct cache_set *c) +-- +2.16.4 + diff --git a/for-next/old2/0004-bcache-ignore-read-ahead-request-failure-on-backing-.patch b/for-next/old2/0004-bcache-ignore-read-ahead-request-failure-on-backing-.patch new file mode 100644 index 0000000..cc0ef1d --- /dev/null +++ b/for-next/old2/0004-bcache-ignore-read-ahead-request-failure-on-backing-.patch @@ -0,0 +1,55 @@ +From 732aba1ff2549438105b085b0387a270586ebdc3 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 22:48:09 +0800 +Subject: [PATCH 04/31] bcache: ignore read-ahead request failure on backing + device + +When md raid device (e.g. raid456) is used as backing device, read-ahead +requests on a degrading and recovering md raid device might be failured +immediately by md raid code, but indeed this md raid array can still be +read or write for normal I/O requests. 
Therefore such failed read-ahead +requests are not real hardware failures. Furthermore, after degrading and +recovering accomplished, read-ahead requests will be handled by md raid +array again. + +For such condition, I/O failures of read-ahead requests don't indicate +real health status (because normal I/O still be served), they should not +be counted into I/O error counter dc->io_errors. + +Since there is no simple way to detect whether the backing device is a +md raid device, this patch simply ignores I/O failures for read-ahead +bios on backing device, to avoid bogus backing device failure on a +degrading md raid array. + +Suggested-and-tested-by: Thorsten Knabe <linux@thorsten-knabe.de> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/io.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index c25097968319..4d93f07f63e5 100644 +--- a/drivers/md/bcache/io.c ++++ b/drivers/md/bcache/io.c +@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) + + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + ++ /* ++ * Read-ahead requests on a degrading and recovering md raid ++ * (e.g. raid6) device might be failured immediately by md ++ * raid code, which is not a real hardware media failure. So ++ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. 
++ */ ++ if (bio->bi_opf & REQ_RAHEAD) { ++ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", ++ dc->backing_dev_name); ++ return; ++ } ++ + errors = atomic_add_return(1, &dc->io_errors); + if (errors < dc->error_limit) + pr_err("%s: IO error on backing device, unrecoverable", +-- +2.16.4 + diff --git a/for-next/old2/0005-bcache-add-io-error-counting-in-write_bdev_super_end.patch b/for-next/old2/0005-bcache-add-io-error-counting-in-write_bdev_super_end.patch new file mode 100644 index 0000000..3ea9b1c --- /dev/null +++ b/for-next/old2/0005-bcache-add-io-error-counting-in-write_bdev_super_end.patch @@ -0,0 +1,38 @@ +From 4570746d46fbbdc0cd0ad54fa454f4ba9e16c8c2 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:42:39 +0800 +Subject: [PATCH 05/31] bcache: add io error counting in + write_bdev_super_endio() + +When backing device super block is written by bch_write_bdev_super(), +the bio complete callback write_bdev_super_endio() simply ignores I/O +status. Indeed such write request also contributes to backing device +health status if the request failed. + +This patch checks bio->bi_status in write_bdev_super_endio(), if there +is error, bch_count_backing_io_errors() will be called to count an I/O +error to dc->io_errors. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f44a666271f5..c486a9de1219 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + static void write_bdev_super_endio(struct bio *bio) + { + struct cached_dev *dc = bio->bi_private; +- /* XXX: error checking */ ++ ++ if (bio->bi_status) ++ bch_count_backing_io_errors(dc, bio); + + closure_put(&dc->sb_write); + } +-- +2.16.4 + diff --git a/for-next/old2/0006-bcache-remove-XXX-comment-line-from-run_cache_set.patch b/for-next/old2/0006-bcache-remove-XXX-comment-line-from-run_cache_set.patch new file mode 100644 index 0000000..5d4071e --- /dev/null +++ b/for-next/old2/0006-bcache-remove-XXX-comment-line-from-run_cache_set.patch @@ -0,0 +1,31 @@ +From e84087569e15dca4859bd6413a2638f7eefa40f1 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:47:38 +0800 +Subject: [PATCH 06/31] bcache: remove "XXX:" comment line from run_cache_set() + +In previous bcache patches for Linux v5.2, the failure code path of +run_cache_set() is tested and fixed. 
So now the following comment +line can be removed from run_cache_set(), + /* XXX: test this, it's broken */ + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index c486a9de1219..962c53493cf0 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1963,7 +1963,7 @@ static int run_cache_set(struct cache_set *c) + } + + closure_sync(&cl); +- /* XXX: test this, it's broken */ ++ + bch_cache_set_error(c, "%s", err); + + return -EIO; +-- +2.16.4 + diff --git a/for-next/old2/0007-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch b/for-next/old2/0007-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch new file mode 100644 index 0000000..d7a856b --- /dev/null +++ b/for-next/old2/0007-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch @@ -0,0 +1,56 @@ +From 327c62d3e7e990243c2ea930e3f60770cffb0124 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:23:35 +0800 +Subject: [PATCH 07/31] bcache: remove unnecessary prefetch() in + bset_search_tree() + +In function bset_search_tree(), when p >= t->size, t->tree[0] will be +prefetched by the following code piece, + 974 unsigned int p = n << 4; + 975 + 976 p &= ((int) (p - t->size)) >> 31; + 977 + 978 prefetch(&t->tree[p]); + +The purpose of the above code is to avoid a branch instruction, but +when p >= t->size, prefetch(&t->tree[0]) has no positive performance +contribution at all. This patch avoids the unnecessary prefetch by only +calling prefetch() when p < t->size. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 16 ++-------------- + 1 file changed, 2 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index 8f07fa6e1739..aa2e4ab0fab9 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -960,22 +960,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + unsigned int inorder, j, n = 1; + + do { +- /* +- * A bit trick here. +- * If p < t->size, (int)(p - t->size) is a minus value and +- * the most significant bit is set, right shifting 31 bits +- * gets 1. If p >= t->size, the most significant bit is +- * not set, right shifting 31 bits gets 0. +- * So the following 2 lines equals to +- * if (p >= t->size) +- * p = 0; +- * but a branch instruction is avoided. +- */ + unsigned int p = n << 4; + +- p &= ((int) (p - t->size)) >> 31; +- +- prefetch(&t->tree[p]); ++ if (p < t->size) ++ prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; +-- +2.16.4 + diff --git a/for-next/old2/0008-bcache-make-bset_search_tree-be-more-understandable.patch b/for-next/old2/0008-bcache-make-bset_search_tree-be-more-understandable.patch new file mode 100644 index 0000000..1518cff --- /dev/null +++ b/for-next/old2/0008-bcache-make-bset_search_tree-be-more-understandable.patch @@ -0,0 +1,58 @@ +From 6a3180c3fea9089b72d0e1ff51816b1552c26117 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:51:40 +0800 +Subject: [PATCH 08/31] bcache: make bset_search_tree() be more understandable + +The purpose of following code in bset_search_tree() is to avoid a branch +instruction, + 994 if (likely(f->exponent != 127)) + 995 n = j * 2 + (((unsigned int) + 996 (f->mantissa - + 997 bfloat_mantissa(search, f))) >> 31); + 998 else + 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) +1000 ? 
j * 2 +1001 : j * 2 + 1; + +This piece of code is not very clear to understand, even when I tried to +add code comment for it, I made a mistake. This patch removes the implicit +bit operation and uses explicit branch to calculate next location in +binary tree search. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 17 +++-------------- + 1 file changed, 3 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index aa2e4ab0fab9..f752cc791f50 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -968,21 +968,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + j = n; + f = &t->tree[j]; + +- /* +- * Similar bit trick, use subtract operation to avoid a branch +- * instruction. +- * +- * n = (f->mantissa > bfloat_mantissa()) +- * ? j * 2 +- * : j * 2 + 1; +- * +- * We need to subtract 1 from f->mantissa for the sign bit trick +- * to work - that's done in make_bfloat() +- */ + if (likely(f->exponent != 127)) +- n = j * 2 + (((unsigned int) +- (f->mantissa - +- bfloat_mantissa(search, f))) >> 31); ++ n = (f->mantissa >= bfloat_mantissa(search, f)) ++ ? j * 2 ++ : j * 2 + 1; + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ? 
j * 2 +-- +2.16.4 + diff --git a/for-next/old2/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch b/for-next/old2/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch new file mode 100644 index 0000000..dcc8e89 --- /dev/null +++ b/for-next/old2/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch @@ -0,0 +1,97 @@ +From de0a1c6ff54f0aa7005470614fcc07e57d6477d5 Mon Sep 17 00:00:00 2001 +From: Alexandru Ardelean <alexandru.ardelean@analog.com> +Date: Tue, 7 May 2019 12:43:12 +0300 +Subject: [PATCH 09/31] bcache: use sysfs_match_string() instead of + __sysfs_match_string() + +The arrays (of strings) that are passed to __sysfs_match_string() are +static, so use sysfs_match_string() which does an implicit ARRAY_SIZE() +over these arrays. + +Functionally, this doesn't change anything. +The change is more cosmetic. + +It only shrinks the static arrays by 1 byte each. + +Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/sysfs.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 6cd44d3cf906..3a520262933d 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -21,28 +21,24 @@ static const char * const bch_cache_modes[] = { + "writethrough", + "writeback", + "writearound", +- "none", +- NULL ++ "none" + }; + + /* Default is 0 ("auto") */ + static const char * const bch_stop_on_failure_modes[] = { + "auto", +- "always", +- NULL ++ "always" + }; + + static const char * const cache_replacement_policies[] = { + "lru", + "fifo", +- "random", +- NULL ++ "random" + }; + + static const char * const error_actions[] = { + "unregister", +- "panic", +- NULL ++ "panic" + }; + + write_attribute(attach); +@@ -333,7 +329,7 @@ STORE(__cached_dev) + bch_cached_dev_run(dc); + + if (attr == &sysfs_cache_mode) { +- v = __sysfs_match_string(bch_cache_modes, 
-1, buf); ++ v = sysfs_match_string(bch_cache_modes, buf); + if (v < 0) + return v; + +@@ -344,7 +340,7 @@ STORE(__cached_dev) + } + + if (attr == &sysfs_stop_when_cache_set_failed) { +- v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); ++ v = sysfs_match_string(bch_stop_on_failure_modes, buf); + if (v < 0) + return v; + +@@ -794,7 +790,7 @@ STORE(__bch_cache_set) + 0, UINT_MAX); + + if (attr == &sysfs_errors) { +- v = __sysfs_match_string(error_actions, -1, buf); ++ v = sysfs_match_string(error_actions, buf); + if (v < 0) + return v; + +@@ -1058,7 +1054,7 @@ STORE(__bch_cache) + } + + if (attr == &sysfs_cache_replacement_policy) { +- v = __sysfs_match_string(cache_replacement_policies, -1, buf); ++ v = sysfs_match_string(cache_replacement_policies, buf); + if (v < 0) + return v; + +-- +2.16.4 + diff --git a/for-next/old2/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch b/for-next/old2/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch new file mode 100644 index 0000000..565ba65 --- /dev/null +++ b/for-next/old2/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch @@ -0,0 +1,151 @@ +From d4d152860628f81b058c84f759fe74b635998204 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 22:16:38 +0800 +Subject: [PATCH 10/31] bcache: add return value check to bch_cached_dev_run() + +This patch adds return value check to bch_cached_dev_run(), now if an +error happens inside bch_cached_dev_run(), it can be caught. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 32 +++++++++++++++++++++++++------- + drivers/md/bcache/sysfs.c | 7 +++++-- + 3 files changed, 31 insertions(+), 10 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index e30a983a68cd..cb268d7c6cea 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -1004,7 +1004,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); + int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); + void bch_cached_dev_detach(struct cached_dev *dc); +-void bch_cached_dev_run(struct cached_dev *dc); ++int bch_cached_dev_run(struct cached_dev *dc); + void bcache_device_stop(struct bcache_device *d); + + void bch_cache_set_unregister(struct cache_set *c); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 962c53493cf0..8bc5c55d0ee2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) + } + + +-void bch_cached_dev_run(struct cached_dev *dc) ++int bch_cached_dev_run(struct cached_dev *dc) + { + struct bcache_device *d = &dc->disk; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); +@@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + ++ if (dc->io_disable) ++ return -EIO; ++ + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); + kfree(buf); +- return; ++ return -EBUSY; + } + + if (!d->c && +@@ -951,8 +954,10 @@ void bch_cached_dev_run(struct cached_dev *dc) + kfree(buf); + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || +- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) ++ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { + pr_debug("error creating sysfs link"); ++ return -ENOMEM; ++ } + + dc->status_update_thread = 
kthread_run(cached_dev_status_update, + dc, "bcache_status_update"); +@@ -961,6 +966,8 @@ void bch_cached_dev_run(struct cached_dev *dc) + "continue to run without monitoring backing " + "device status"); + } ++ ++ return 0; + } + + /* +@@ -1056,6 +1063,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); + struct uuid_entry *u; + struct cached_dev *exist_dc, *t; ++ int ret = 0; + + if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) +@@ -1165,7 +1173,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + + bch_sectors_dirty_init(&dc->disk); + +- bch_cached_dev_run(dc); ++ ret = bch_cached_dev_run(dc); ++ if (ret && (ret != -EBUSY)) { ++ up_write(&dc->writeback_lock); ++ return ret; ++ } ++ + bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); + +@@ -1292,6 +1305,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, + { + const char *err = "cannot allocate memory"; + struct cache_set *c; ++ int ret = -ENOMEM; + + bdevname(bdev, dc->backing_dev_name); + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); +@@ -1321,14 +1335,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, + bch_cached_dev_attach(dc, c, NULL); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || +- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) +- bch_cached_dev_run(dc); ++ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { ++ err = "failed to run cached device"; ++ ret = bch_cached_dev_run(dc); ++ if (ret) ++ goto err; ++ } + + return 0; + err: + pr_notice("error %s: %s", dc->backing_dev_name, err); + bcache_device_stop(&dc->disk); +- return -EIO; ++ return ret; + } + + /* Flash only volumes */ +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 3a520262933d..129031663cc8 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -325,8 
+325,11 @@ STORE(__cached_dev) + bch_cache_accounting_clear(&dc->accounting); + + if (attr == &sysfs_running && +- strtoul_or_return(buf)) +- bch_cached_dev_run(dc); ++ strtoul_or_return(buf)) { ++ v = bch_cached_dev_run(dc); ++ if (v) ++ return v; ++ } + + if (attr == &sysfs_cache_mode) { + v = sysfs_match_string(bch_cache_modes, buf); +-- +2.16.4 + diff --git a/for-next/old2/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch b/for-next/old2/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch new file mode 100644 index 0000000..2366c77 --- /dev/null +++ b/for-next/old2/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch @@ -0,0 +1,72 @@ +From 916a60181db538619e372a4f76e5a01120478b79 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 22:36:35 +0800 +Subject: [PATCH 11/31] bcache: remove unncessary code in bch_btree_keys_init() + +Function bch_btree_keys_init() initializes b->set[].size and +b->set[].data to zero. As the code comments indicates, these code indeed +is unncessary, because both struct btree_keys and struct bset_tree are +nested embedded into struct btree, when struct btree is filled with 0 +bits by kzalloc() in mca_bucket_alloc(), b->set[].size and +b->set[].data are initialized to 0 (a.k.a NULL) already. + +This patch removes the redundant code, and add comments in +bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 15 ++++++--------- + drivers/md/bcache/btree.c | 4 ++++ + 2 files changed, 10 insertions(+), 9 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index f752cc791f50..32e2e4d8fa6c 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); + void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) + { +- unsigned int i; +- + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + +- /* XXX: shouldn't be needed */ +- for (i = 0; i < MAX_BSETS; i++) +- b->set[i].size = 0; + /* +- * Second loop starts at 1 because b->keys[0]->data is the memory we +- * allocated ++ * struct btree_keys in embedded in struct btree, and struct ++ * bset_tree is embedded into struct btree_keys. They are all ++ * initialized as 0 by kzalloc() in mca_bucket_alloc(), and ++ * b->set[0].data is allocated in bch_btree_keys_alloc(), so we ++ * don't have to initiate b->set[].size and b->set[].data here ++ * any more. + */ +- for (i = 1; i < MAX_BSETS; i++) +- b->set[i].data = NULL; + } + EXPORT_SYMBOL(bch_btree_keys_init); + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 773f5fdad25f..cf38a1b031fa 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) + static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) + { ++ /* ++ * kzalloc() is necessary here for initialization, ++ * see code comments in bch_btree_keys_init(). 
++ */ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + + if (!b) +-- +2.16.4 + diff --git a/for-next/old2/0012-bcache-avoid-a-deadlock-in-bcache_reboot.patch b/for-next/old2/0012-bcache-avoid-a-deadlock-in-bcache_reboot.patch new file mode 100644 index 0000000..4f73d80 --- /dev/null +++ b/for-next/old2/0012-bcache-avoid-a-deadlock-in-bcache_reboot.patch @@ -0,0 +1,212 @@ +From 6fccf1970e4705e9924329dc4dbb24181338a257 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 23:19:55 +0800 +Subject: [PATCH 12/31] bcache: avoid a deadlock in bcache_reboot() + +It is quite frequently to observe deadlock in bcache_reboot() happens +and hang the system reboot process. The reason is, in bcache_reboot() +when calling bch_cache_set_stop() and bcache_device_stop() the mutex +bch_register_lock is held. But in the process to stop cache set and +bcache device, bch_register_lock will be acquired again. If this mutex +is held here, deadlock will happen inside the stopping process. The +aftermath of the deadlock is, whole system reboot gets hung. + +The fix is to avoid holding bch_register_lock for the following loops +in bcache_reboot(), + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + +A module range variable 'bcache_is_reboot' is added, it sets to true +in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked +to be true, reject the registration by returning -EBUSY immediately. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 41 ++++++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++++++++ + 2 files changed, 66 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 8bc5c55d0ee2..978689d4363c 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -40,6 +40,7 @@ static const char invalid_uuid[] = { + + static struct kobject *bcache_kobj; + struct mutex bch_register_lock; ++bool bcache_is_reboot; + LIST_HEAD(bch_cache_sets); + static LIST_HEAD(uncached_devices); + +@@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; + struct workqueue_struct *bcache_wq; + struct workqueue_struct *bch_journal_wq; + ++ + #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) + /* limitation of partitions number on single bcache device */ + #define BCACHE_MINORS 128 +@@ -2325,6 +2327,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + ++ /* For latest state of bcache_is_reboot */ ++ smp_mb(); ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto err; +@@ -2404,6 +2411,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + { ++ if (bcache_is_reboot) ++ return NOTIFY_DONE; ++ + if (code == SYS_DOWN || + code == SYS_HALT || + code == SYS_POWER_OFF) { +@@ -2416,19 +2426,46 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + + mutex_lock(&bch_register_lock); + ++ if (bcache_is_reboot) { ++ goto out; ++ } ++ ++ /* New registration is rejected since now */ ++ bcache_is_reboot = true; ++ /* ++ * Make registering caller (if there is) on other CPU ++ * core know bcache_is_reboot set to true earlier ++ */ ++ smp_mb(); ++ + if 
(list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + ++ mutex_unlock(&bch_register_lock); ++ + pr_info("Stopping all devices:"); + ++ /* ++ * The reason bch_register_lock is not held to call ++ * bch_cache_set_stop() and bcache_device_stop() is to ++ * avoid potential deadlock during reboot, because cache ++ * set or bcache device stopping process will acqurie ++ * bch_register_lock too. ++ * ++ * We are safe here because bcache_is_reboot sets to ++ * true already, register_bcache() will reject new ++ * registration now. bcache_is_reboot also makes sure ++ * bcache_reboot() won't be re-entered on by other thread, ++ * so there is no race in following list iteration by ++ * list_for_each_entry_safe(). ++ */ + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + +- mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and +@@ -2555,6 +2592,8 @@ static int __init bcache_init(void) + bch_debug_init(); + closure_debug_init(); + ++ bcache_is_reboot = false; ++ + return 0; + err: + bcache_exit(); +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 129031663cc8..961a13a223ee 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -16,6 +16,8 @@ + #include <linux/sort.h> + #include <linux/sched/clock.h> + ++extern bool bcache_is_reboot; ++ + /* Default is 0 ("writethrough") */ + static const char * const bch_cache_modes[] = { + "writethrough", +@@ -267,6 +269,10 @@ STORE(__cached_dev) + struct cache_set *c; + struct kobj_uevent_env *env; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + #define d_strtoul(var) sysfs_strtoul(var, dc->var) + #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) + #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) +@@ -407,6 +413,10 @@ 
STORE(bch_cached_dev) + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + mutex_lock(&bch_register_lock); + size = __cached_dev_store(kobj, attr, buf, size); + +@@ -505,6 +515,10 @@ STORE(__bch_flash_dev) + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + sysfs_strtoul(data_csum, d->data_csum); + + if (attr == &sysfs_size) { +@@ -740,6 +754,10 @@ STORE(__bch_cache_set) + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + ssize_t v; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + +@@ -859,6 +877,10 @@ STORE(bch_cache_set_internal) + { + struct cache_set *c = container_of(kobj, struct cache_set, internal); + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + return bch_cache_set_store(&c->kobj, attr, buf, size); + } + +@@ -1044,6 +1066,10 @@ STORE(__bch_cache) + struct cache *ca = container_of(kobj, struct cache, kobj); + ssize_t v; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + +-- +2.16.4 + diff --git a/for-next/old2/0013-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch b/for-next/old2/0013-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch new file mode 100644 index 0000000..37037e5 --- /dev/null +++ b/for-next/old2/0013-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch @@ -0,0 +1,52 @@ +From e5b879a8c9c431f7dfce7ebf1edd7f277a5463c0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 22 May 2019 21:55:09 +0800 +Subject: [PATCH 13/31] bcache: check CACHE_SET_IO_DISABLE in allocator code + 
+If CACHE_SET_IO_DISABLE of a cache set flag is set by too many I/O +errors, currently allocator routines can still continue allocate +space which may introduce inconsistent metadata state. + +This patch checkes CACHE_SET_IO_DISABLE bit in following allocator +routines, +- bch_bucket_alloc() +- __bch_bucket_alloc_set() +Once CACHE_SET_IO_DISABLE is set on cache set, the allocator routines +may reject allocation request earlier to avoid potential inconsistent +metadata. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/alloc.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index f8986effcb50..34ae5bb6724a 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) + struct bucket *b; + long r; + ++ ++ /* No allocation if CACHE_SET_IO_DISABLE set */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) ++ return -1; ++ + /* fastpath */ + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) +@@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + { + int i; + ++ /* No allocation if CACHE_SET_IO_DISABLE set */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return -1; ++ + lockdep_assert_held(&c->bucket_lock); + BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); + +-- +2.16.4 + diff --git a/for-next/old2/0014-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch b/for-next/old2/0014-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch new file mode 100644 index 0000000..d19a50e --- /dev/null +++ b/for-next/old2/0014-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch @@ -0,0 +1,39 @@ +From 3ed211c0914ea6b5fca88b386b586b17e3ddb835 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 22 May 2019 22:06:21 +0800 +Subject: [PATCH 14/31] bcache: 
check CACHE_SET_IO_DISABLE bit in bch_journal() + +When too many I/O errors happen on cache set and CACHE_SET_IO_DISABLE +bit is set, bch_journal() may continue to work because the journaling +bkey might be still in write set yet. The caller of bch_journal() may +believe the journal still work but the truth is in-memory journal write +set won't be written into cache device any more. This behavior may +introduce potential inconsistent metadata status. + +This patch checks CACHE_SET_IO_DISABLE bit at the head of bch_journal(), +if the bit is set, bch_journal() returns NULL immediately to notice +caller to know journal does not work. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 38849736fa1c..ddbdbeb758e8 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -795,6 +795,10 @@ atomic_t *bch_journal(struct cache_set *c, + struct journal_write *w; + atomic_t *ret; + ++ /* No journaling if CACHE_SET_IO_DISABLE set already */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return NULL; ++ + if (!CACHE_SYNC(&c->sb)) + return NULL; + +-- +2.16.4 + diff --git a/for-next/old2/0015-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch b/for-next/old2/0015-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch new file mode 100644 index 0000000..10d5381 --- /dev/null +++ b/for-next/old2/0015-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch @@ -0,0 +1,107 @@ +From ac49935ca5ad1a451e2b47091866f9e20fb87c4a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 20 Mar 2019 23:11:59 +0800 +Subject: [PATCH 15/31] bcache: add pendings_cleanup to stop pending bcache + device + +If a bcache device is in dirty state and its cache set is not +registered, this bcache deivce will not appear in /dev/bcache<N>, +and there is no way to stop it or remove the bcache 
kernel module. + +This is an as-designed behavior, but sometimes people has to reboot +whole system to release or stop the pending backing device. + +This sysfs interface may remove such pending bcache devices when +write anything into the sysfs file manually. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 55 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 978689d4363c..905aece72664 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2277,9 +2277,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, + + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); ++kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); + + static bool bch_is_open_backing(struct block_device *bdev) + { +@@ -2409,6 +2413,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + goto out; + } + ++ ++struct pdev { ++ struct list_head list; ++ struct cached_dev *dc; ++}; ++ ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, ++ size_t size) ++{ ++ LIST_HEAD(pending_devs); ++ ssize_t ret = size; ++ struct cached_dev *dc, *tdc; ++ struct pdev *pdev, *tpdev; ++ struct cache_set *c, *tc; ++ ++ mutex_lock(&bch_register_lock); ++ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { ++ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); ++ if (!pdev) ++ break; ++ pdev->dc = dc; ++ list_add(&pdev->list, &pending_devs); ++ } ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ 
list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { ++ char *pdev_set_uuid = pdev->dc->sb.set_uuid; ++ char *set_uuid = c->sb.uuid; ++ ++ if (!memcmp(pdev_set_uuid, set_uuid, 16)) { ++ list_del(&pdev->list); ++ kfree(pdev); ++ break; ++ } ++ } ++ } ++ mutex_unlock(&bch_register_lock); ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ pr_info("delete pdev %p", pdev); ++ list_del(&pdev->list); ++ bcache_device_stop(&pdev->dc->disk); ++ kfree(pdev); ++ } ++ ++ return ret; ++} ++ + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + { + if (bcache_is_reboot) +@@ -2557,6 +2611,7 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++ &ksysfs_pendings_cleanup.attr, + NULL + }; + +-- +2.16.4 + diff --git a/for-next/old2/0016-bcache-add-code-comments-for-journal_read_bucket.patch b/for-next/old2/0016-bcache-add-code-comments-for-journal_read_bucket.patch new file mode 100644 index 0000000..cf8a5e9 --- /dev/null +++ b/for-next/old2/0016-bcache-add-code-comments-for-journal_read_bucket.patch @@ -0,0 +1,71 @@ +From 275ca04c5e2105b800bcb8440fcd1d9d5de8aca9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 30 May 2019 18:39:17 +0800 +Subject: [PATCH 16/31] bcache: add code comments for journal_read_bucket() + +make the code more understandable + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index ddbdbeb758e8..7f7f5e947d7e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset; + + blocks = set_blocks(j, block_bytes(ca->set)); + ++ /* ++ * Nodes in 'list' are in linear increasing order of ++ * i->j.seq, the node on head has the smallest (oldest) ++ * journal seq, 
the node on tail has the biggest ++ * (latest) journal seq. ++ */ ++ ++ /* ++ * Check from the oldest jset for last_seq. If ++ * i->j.seq < j->last_seq, it means the oldest jset ++ * in list is expired and useless, remove it from ++ * this list. Otherwise, j is a condidate jset for ++ * further following checks. ++ */ + while (!list_empty(list)) { + i = list_first_entry(list, + struct journal_replay, list); +@@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; + kfree(i); + } + ++ /* iterate list in reverse order (from latest jset) */ + list_for_each_entry_reverse(i, list, list) { + if (j->seq == i->j.seq) + goto next_set; + ++ /* ++ * if j->seq is less than any i->j.last_seq ++ * in list, j is an expired and useless jset. ++ */ + if (j->seq < i->j.last_seq) + goto next_set; + ++ /* ++ * 'where' points to first jset in list which ++ * is elder then j. ++ */ + if (j->seq > i->j.seq) { + where = &i->list; + goto add; +@@ -129,6 +152,7 @@ reread: left = ca->sb.bucket_size - offset; + if (!i) + return -ENOMEM; + memcpy(&i->j, j, bytes); ++ /* Add to the location after 'where' points to */ + list_add(&i->list, where); + ret = 1; + +-- +2.16.4 + diff --git a/for-next/old2/0017-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch b/for-next/old2/0017-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch new file mode 100644 index 0000000..ee01f97 --- /dev/null +++ b/for-next/old2/0017-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch @@ -0,0 +1,30 @@ +From 3ccac67fe77c5735537e609f9d3aeb70e72eaaca Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 30 May 2019 18:40:37 +0800 +Subject: [PATCH 17/31] bcache: set largest seq to ja->seq[bucket_index] in + journal_read_bucket() + +Make sure always setting largest seq to ja->seq[bucket_index] + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/journal.c 
b/drivers/md/bcache/journal.c +index 7f7f5e947d7e..152ec33981be 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -156,7 +156,8 @@ reread: left = ca->sb.bucket_size - offset; + list_add(&i->list, where); + ret = 1; + +- ja->seq[bucket_index] = j->seq; ++ if (j->seq > ja->seq[bucket_index]) ++ ja->seq[bucket_index] = j->seq; + next_set: + offset += blocks * ca->sb.block_size; + len -= blocks * ca->sb.block_size; +-- +2.16.4 + diff --git a/for-next/old2/0018-bcache-simplify-bch_journal_read.patch b/for-next/old2/0018-bcache-simplify-bch_journal_read.patch new file mode 100644 index 0000000..0c46ec7 --- /dev/null +++ b/for-next/old2/0018-bcache-simplify-bch_journal_read.patch @@ -0,0 +1,143 @@ +From 16ec857d53042372c10a7007465e274626ff20f8 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 30 May 2019 21:52:50 +0800 +Subject: [PATCH 18/31] bcache: simplify bch_journal_read() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 99 ++++----------------------------------------- + 1 file changed, 7 insertions(+), 92 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 152ec33981be..0e39d6d63ab8 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -170,109 +170,25 @@ reread: left = ca->sb.bucket_size - offset; + + int bch_journal_read(struct cache_set *c, struct list_head *list) + { +-#define read_bucket(b) \ +- ({ \ +- ret = journal_read_bucket(ca, list, b); \ +- __set_bit(b, bitmap); \ +- if (ret < 0) \ +- return ret; \ +- ret; \ +- }) +- + struct cache *ca; + unsigned int iter; +- int ret = 0; + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; +- DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); +- unsigned int i, l, r, m; +- uint64_t seq; ++ unsigned short i; ++ uint64_t seq = 0; + +- bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets", ca->sb.njournal_buckets); +- +- /* +- * Read 
journal buckets ordered by golden ratio hash to quickly +- * find a sequence of buckets with valid journal entries +- */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { +- /* +- * We must try the index l with ZERO first for +- * correctness due to the scenario that the journal +- * bucket is circular buffer which might have wrapped +- */ +- l = (i * 2654435769U) % ca->sb.njournal_buckets; +- +- if (test_bit(l, bitmap)) +- break; ++ int ret; + +- if (read_bucket(l)) +- goto bsearch; ++ ret = journal_read_bucket(ca, list, i); ++ if (ret < 0) ++ return ret; + } + +- /* +- * If that fails, check all the buckets we haven't checked +- * already +- */ +- pr_debug("falling back to linear search"); +- +- for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); +- l < ca->sb.njournal_buckets; +- l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, +- l + 1)) +- if (read_bucket(l)) +- goto bsearch; +- +- /* no journal entries on this device? */ +- if (l == ca->sb.njournal_buckets) +- continue; +-bsearch: + BUG_ON(list_empty(list)); + +- /* Binary search */ +- m = l; +- r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); +- pr_debug("starting binary search, l %u r %u", l, r); +- +- while (l + 1 < r) { +- seq = list_entry(list->prev, struct journal_replay, +- list)->j.seq; +- +- m = (l + r) >> 1; +- read_bucket(m); +- +- if (seq != list_entry(list->prev, struct journal_replay, +- list)->j.seq) +- l = m; +- else +- r = m; +- } +- +- /* +- * Read buckets in reverse order until we stop finding more +- * journal entries +- */ +- pr_debug("finishing up: m %u njournal_buckets %u", +- m, ca->sb.njournal_buckets); +- l = m; +- +- while (1) { +- if (!l--) +- l = ca->sb.njournal_buckets - 1; +- +- if (l == m) +- break; +- +- if (test_bit(l, bitmap)) +- continue; +- +- if (!read_bucket(l)) +- break; +- } +- +- seq = 0; +- + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; +@@ -293,8 +209,7 @@ int bch_journal_read(struct 
cache_set *c, struct list_head *list) + struct journal_replay, + list)->j.seq; + +- return ret; +-#undef read_bucket ++ return 0; + } + + void bch_journal_mark(struct cache_set *c, struct list_head *list) +-- +2.16.4 + diff --git a/for-next/old2/0019-bcache-shrink-btree-node-cache-after-bch_btree_check.patch b/for-next/old2/0019-bcache-shrink-btree-node-cache-after-bch_btree_check.patch new file mode 100644 index 0000000..583519f --- /dev/null +++ b/for-next/old2/0019-bcache-shrink-btree-node-cache-after-bch_btree_check.patch @@ -0,0 +1,45 @@ +From a5803acaa5237df9515f6e49adec1b71763b6a36 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 31 May 2019 17:29:56 +0800 +Subject: [PATCH 19/31] bcache: shrink btree node cache after bch_btree_check() + +To release memory proactively for memory allocation in following +routines. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 905aece72664..f0e75b0b1eb6 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1875,6 +1875,24 @@ static int run_cache_set(struct cache_set *c) + if (bch_btree_check(c)) + goto err; + ++ /* ++ * bch_btree_check() may occupy too much system memory which ++ * will fail memory allocation operations in the following ++ * routines before kernel triggers memory shrinker call backs. ++ * Shrinking 25% mca cache memory proactively here to avoid ++ * potential memory allocation failure. 
++ */ ++ if (!c->shrinker_disabled) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = ++ c->shrink.count_objects(&c->shrink, &sc) / 4; ++ pr_debug("try to shrink %lu (25%%) cached btree node", ++ sc.nr_to_scan); ++ c->shrink.scan_objects(&c->shrink, &sc); ++ } ++ + bch_journal_mark(c, &journal); + bch_initial_gc_finish(c); + pr_debug("btree_check() done"); +-- +2.16.4 + diff --git a/for-next/old2/0020-bcache-more-detailed-error-message-to-bcache_device_.patch b/for-next/old2/0020-bcache-more-detailed-error-message-to-bcache_device_.patch new file mode 100644 index 0000000..7503181 --- /dev/null +++ b/for-next/old2/0020-bcache-more-detailed-error-message-to-bcache_device_.patch @@ -0,0 +1,43 @@ +From d83f5094f6c0f5d2e3c9b73dc53a9eebcebc9274 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 00:57:38 +0800 +Subject: [PATCH 20/31] bcache: more detailed error message to + bcache_device_link() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f0e75b0b1eb6..be445bb0fe23 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -695,6 +695,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + { + unsigned int i; + struct cache *ca; ++ int ret; + + for_each_cache(ca, d->c, i) + bd_link_disk_holder(ca->bdev, d->disk); +@@ -702,9 +703,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + +- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || +- sysfs_create_link(&c->kobj, &d->kobj, d->name), +- "Couldn't create device <-> cache set symlinks"); ++ ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); ++ if (ret < 0) ++ pr_err("Couldn't create device -> cache set symlink"); ++ ++ ret = 
sysfs_create_link(&c->kobj, &d->kobj, d->name); ++ if (ret < 0) ++ pr_err("Couldn't create cache set -> device symlink"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); + } +-- +2.16.4 + diff --git a/for-next/old2/0021-bcache-add-more-error-message-in-bch_cached_dev_atta.patch b/for-next/old2/0021-bcache-add-more-error-message-in-bch_cached_dev_atta.patch new file mode 100644 index 0000000..1589fcc --- /dev/null +++ b/for-next/old2/0021-bcache-add-more-error-message-in-bch_cached_dev_atta.patch @@ -0,0 +1,36 @@ +From 028baa2853232d534bb41def4592f2517f46ceb3 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:03:00 +0800 +Subject: [PATCH 21/31] bcache: add more error message in + bch_cached_dev_attach() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index be445bb0fe23..c9328070bed8 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1170,6 +1170,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + down_write(&dc->writeback_lock); + if (bch_cached_dev_writeback_start(dc)) { + up_write(&dc->writeback_lock); ++ pr_err("Couldn't start writeback facilities for %s", ++ dc->disk.disk->disk_name); + return -ENOMEM; + } + +@@ -1183,6 +1185,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); ++ pr_err("Couldn't run cached device %s", ++ dc->backing_dev_name); + return ret; + } + +-- +2.16.4 + diff --git a/for-next/old2/0022-bcache-improve-error-message-in-bch_cached_dev_run.patch b/for-next/old2/0022-bcache-improve-error-message-in-bch_cached_dev_run.patch new file mode 100644 index 0000000..402dcc4 --- /dev/null +++ b/for-next/old2/0022-bcache-improve-error-message-in-bch_cached_dev_run.patch @@ -0,0 +1,49 @@ +From 
4e7cab006d1c4333f82aab9eec1a41e9c4ef7e17 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:09:06 +0800 +Subject: [PATCH 22/31] bcache: improve error message in bch_cached_dev_run() + +should use pr_err(), not pr_debug(), and provide more detailed error +message. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index c9328070bed8..a254225e726b 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -928,13 +928,18 @@ int bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + +- if (dc->io_disable) ++ if (dc->io_disable) { ++ pr_err("I/O disabled on cached dev %s", ++ dc->backing_dev_name); + return -EIO; ++ } + + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); + kfree(buf); ++ pr_err("cached dev %s is running already", ++ dc->backing_dev_name); + return -EBUSY; + } + +@@ -962,7 +967,7 @@ int bch_cached_dev_run(struct cached_dev *dc) + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { +- pr_debug("error creating sysfs link"); ++ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); + return -ENOMEM; + } + +-- +2.16.4 + diff --git a/for-next/old2/0023-bcache-fix-race-in-btree_flush_write.patch b/for-next/old2/0023-bcache-fix-race-in-btree_flush_write.patch new file mode 100644 index 0000000..7a4d2ce --- /dev/null +++ b/for-next/old2/0023-bcache-fix-race-in-btree_flush_write.patch @@ -0,0 +1,213 @@ +From 5c02d54148c5a04e6dfd3e23d84b7efd719405de Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:55:30 +0800 +Subject: [PATCH 23/31] bcache: fix race in btree_flush_write() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/btree.c | 15 +++++++- + drivers/md/bcache/btree.h | 2 + + 
drivers/md/bcache/journal.c | 93 ++++++++++++++++++++++++++++++++++----------- + drivers/md/bcache/journal.h | 4 ++ + 4 files changed, 90 insertions(+), 24 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index cf38a1b031fa..c0dd8fde37af 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -660,6 +660,13 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) + } + + mutex_lock(&b->write_lock); ++ /* don't reap btree node handling in btree_flush_write() */ ++ if (btree_node_journal_flush(b)) { ++ pr_debug("bnode %p is flushing by journal, ignore", b); ++ mutex_unlock(&b->write_lock); ++ goto out_unlock; ++ } ++ + if (btree_node_dirty(b)) + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); +@@ -1071,8 +1078,14 @@ static void btree_node_free(struct btree *b) + + BUG_ON(b == b->c->root); + ++retry: + mutex_lock(&b->write_lock); +- ++ if (btree_node_journal_flush(b)) { ++ mutex_unlock(&b->write_lock); ++ pr_debug("bnode %p journal_flush set, retry", b); ++ schedule_timeout_interruptible(1); ++ goto retry; ++ } + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); +diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h +index d1c72ef64edf..76cfd121a486 100644 +--- a/drivers/md/bcache/btree.h ++++ b/drivers/md/bcache/btree.h +@@ -158,11 +158,13 @@ enum btree_flags { + BTREE_NODE_io_error, + BTREE_NODE_dirty, + BTREE_NODE_write_idx, ++ BTREE_NODE_journal_flush, + }; + + BTREE_FLAG(io_error); + BTREE_FLAG(dirty); + BTREE_FLAG(write_idx); ++BTREE_FLAG(journal_flush); + + static inline struct btree_write *btree_current_write(struct btree *b) + { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 0e39d6d63ab8..28e53b196a80 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -334,41 +334,87 @@ int bch_journal_replay(struct cache_set *s, struct list_head 
*list) + + static void btree_flush_write(struct cache_set *c) + { +- /* +- * Try to find the btree node with that references the oldest journal +- * entry, best is our current candidate and is locked if non NULL: +- */ +- struct btree *b, *best; +- unsigned i; ++ struct btree *b, *btree_nodes[BTREE_FLUSH_NR]; ++ unsigned i, n; ++ ++ if (c->journal.btree_flushing) ++ return; ++ ++ spin_lock(&c->journal.flush_write_lock); ++ if (c->journal.btree_flushing) { ++ spin_unlock(&c->journal.flush_write_lock); ++ return; ++ } ++ c->journal.btree_flushing = true; ++ spin_unlock(&c->journal.flush_write_lock); + + atomic_long_inc(&c->flush_write); +-retry: +- best = NULL; +- +- for_each_cached_btree(b, c, i) +- if (btree_current_write(b)->journal) { +- if (!best) +- best = b; +- else if (journal_pin_cmp(c, +- btree_current_write(best)->journal, +- btree_current_write(b)->journal)) { +- best = b; +- } ++ memset(btree_nodes, 0, sizeof(btree_nodes)); ++ n = 0; ++ ++ mutex_lock(&c->bucket_lock); ++ list_for_each_entry_reverse(b, &c->btree_cache, list) { ++ if (btree_node_journal_flush(b)) ++ pr_err("BUG: flush_write bit should not be set here!"); ++ ++ mutex_lock(&b->write_lock); ++ ++ if(!btree_node_dirty(b)) { ++ mutex_unlock(&b->write_lock); ++ continue; ++ } ++ ++ if (!btree_current_write(b)->journal) { ++ mutex_unlock(&b->write_lock); ++ continue; ++ } ++ ++ set_btree_node_journal_flush(b); ++ ++ mutex_unlock(&b->write_lock); ++ ++ btree_nodes[n++] = b; ++ if (n == BTREE_FLUSH_NR) ++ break; ++ } ++ mutex_unlock(&c->bucket_lock); ++ ++ for (i = 0; i < n; i++) { ++ b = btree_nodes[i]; ++ if (!b) { ++ pr_err("BUG: btree_nodes[%d] is NULL", i); ++ continue; ++ } ++ ++ /* safe to check without holding b->write_lock */ ++ if (!btree_node_journal_flush(b)) { ++ pr_err("BUG: bnode %p: journal_flush bit cleaned", b); ++ continue; + } + +- b = best; +- if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); +- /* We raced */ +- 
atomic_long_inc(&c->retry_flush_write); +- goto retry; ++ pr_debug("bnode %p: written by others", b); ++ clear_bit(BTREE_NODE_journal_flush, &b->flags); ++ continue; ++ } ++ ++ if (!btree_node_dirty(b)) { ++ pr_debug("bnode %p: dirty bit cleaned by others", b); ++ clear_bit(BTREE_NODE_journal_flush, &b->flags); ++ mutex_unlock(&b->write_lock); ++ continue; + } + + __bch_btree_node_write(b, NULL); ++ clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + } ++ ++ spin_lock(&c->journal.flush_write_lock); ++ c->journal.btree_flushing = false; ++ spin_unlock(&c->journal.flush_write_lock); + } + + #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) +@@ -790,6 +836,7 @@ int bch_journal_alloc(struct cache_set *c) + struct journal *j = &c->journal; + + spin_lock_init(&j->lock); ++ spin_lock_init(&j->flush_write_lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); + + c->journal_delay_ms = 100; +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index 66f0facff84b..aeed791f05e7 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -103,6 +103,8 @@ struct journal_write { + /* Embedded in struct cache_set */ + struct journal { + spinlock_t lock; ++ spinlock_t flush_write_lock; ++ bool btree_flushing; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +@@ -154,6 +156,8 @@ struct journal_device { + struct bio_vec bv[8]; + }; + ++#define BTREE_FLUSH_NR 32 ++ + #define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + +-- +2.16.4 + diff --git a/for-next/old2/0024-bcache-remove-retry_flush_write-from-struct-cache_se.patch b/for-next/old2/0024-bcache-remove-retry_flush_write-from-struct-cache_se.patch new file mode 100644 index 0000000..de48d70 --- /dev/null +++ b/for-next/old2/0024-bcache-remove-retry_flush_write-from-struct-cache_se.patch @@ -0,0 +1,58 @@ +From 
e4421d2068810b74f81c8f7dea37762dab985bc4 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:58:23 +0800 +Subject: [PATCH 24/31] bcache: remove retry_flush_write from struct cache_set + +useless anymore + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 - + drivers/md/bcache/sysfs.c | 5 ----- + 2 files changed, 6 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index cb268d7c6cea..35396248a7d5 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -706,7 +706,6 @@ struct cache_set { + + atomic_long_t reclaim; + atomic_long_t flush_write; +- atomic_long_t retry_flush_write; + + enum { + ON_ERROR_UNREGISTER, +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 961a13a223ee..0bfe4e30c501 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -83,7 +83,6 @@ read_attribute(state); + read_attribute(cache_read_races); + read_attribute(reclaim); + read_attribute(flush_write); +-read_attribute(retry_flush_write); + read_attribute(writeback_keys_done); + read_attribute(writeback_keys_failed); + read_attribute(io_errors); +@@ -704,9 +703,6 @@ SHOW(__bch_cache_set) + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + +- sysfs_print(retry_flush_write, +- atomic_long_read(&c->retry_flush_write)); +- + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, +@@ -931,7 +927,6 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_cache_read_races, + &sysfs_reclaim, + &sysfs_flush_write, +- &sysfs_retry_flush_write, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + +-- +2.16.4 + diff --git a/for-next/old2/0025-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch b/for-next/old2/0025-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch new file mode 100644 index 0000000..9c4a546 --- /dev/null +++ 
b/for-next/old2/0025-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch @@ -0,0 +1,107 @@ +From 5821f626160483f2b5fad3f55c5017287e993e2e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 00:36:18 +0800 +Subject: [PATCH 25/31] bcache: use bcache_mod_wq to replace system wide + system_wq + +to avoid blocking happens in bcache worker blocks other kernel +subsystem kworker (e.g. network). + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a254225e726b..0bdf589ff5f2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -47,6 +47,7 @@ static LIST_HEAD(uncached_devices); + static int bcache_major; + static DEFINE_IDA(bcache_device_idx); + static wait_queue_head_t unregister_wait; ++struct workqueue_struct *bcache_mod_wq; + struct workqueue_struct *bcache_wq; + struct workqueue_struct *bch_journal_wq; + +@@ -1260,7 +1261,7 @@ static void cached_dev_flush(struct closure *cl) + bch_cache_accounting_destroy(&dc->accounting); + kobject_del(&d->kobj); + +- continue_at(cl, cached_dev_free, system_wq); ++ continue_at(cl, cached_dev_free, bcache_mod_wq); + } + + static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) +@@ -1272,7 +1273,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) + __module_get(THIS_MODULE); + INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); +- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); ++ set_closure_fn(&dc->disk.cl, cached_dev_flush, bcache_mod_wq); + kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); + INIT_WORK(&dc->detach, cached_dev_detach_finish); + sema_init(&dc->sb_write_mutex, 1); +@@ -1395,7 +1396,7 @@ static void flash_dev_flush(struct closure *cl) + bcache_device_unlink(d); + mutex_unlock(&bch_register_lock); + 
kobject_del(&d->kobj); +- continue_at(cl, flash_dev_free, system_wq); ++ continue_at(cl, flash_dev_free, bcache_mod_wq); + } + + static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) +@@ -1406,7 +1407,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) + return -ENOMEM; + + closure_init(&d->cl, NULL); +- set_closure_fn(&d->cl, flash_dev_flush, system_wq); ++ set_closure_fn(&d->cl, flash_dev_flush, bcache_mod_wq); + + kobject_init(&d->kobj, &bch_flash_dev_ktype); + +@@ -1714,7 +1715,7 @@ static void __cache_set_unregister(struct closure *cl) + + mutex_unlock(&bch_register_lock); + +- continue_at(cl, cache_set_flush, system_wq); ++ continue_at(cl, cache_set_flush, bcache_mod_wq); + } + + void bch_cache_set_stop(struct cache_set *c) +@@ -1743,10 +1744,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + + __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); +- set_closure_fn(&c->cl, cache_set_free, system_wq); ++ set_closure_fn(&c->cl, cache_set_free, bcache_mod_wq); + + closure_init(&c->caching, &c->cl); +- set_closure_fn(&c->caching, __cache_set_unregister, system_wq); ++ set_closure_fn(&c->caching, __cache_set_unregister, bcache_mod_wq); + + /* Maybe create continue_at_noreturn() and use it here? 
*/ + closure_set_stopped(&c->cl); +@@ -2601,6 +2602,8 @@ static void bcache_exit(void) + bch_request_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); ++ if (bcache_mod_wq) ++ destroy_workqueue(bcache_mod_wq); + if (bcache_wq) + destroy_workqueue(bcache_wq); + if (bch_journal_wq) +@@ -2660,6 +2663,10 @@ static int __init bcache_init(void) + return bcache_major; + } + ++ bcache_mod_wq = alloc_workqueue("bcache_mod_wq", WQ_MEM_RECLAIM, 0); ++ if (!bcache_mod_wq) ++ goto err; ++ + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + if (!bcache_wq) + goto err; +-- +2.16.4 + diff --git a/for-next/old2/0026-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch b/for-next/old2/0026-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch new file mode 100644 index 0000000..85cdd71 --- /dev/null +++ b/for-next/old2/0026-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch @@ -0,0 +1,73 @@ +From a7b1ad5e3423b9e52707655561d1a0a901724c60 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 00:47:23 +0800 +Subject: [PATCH 26/31] bcache: add reclaimed_journal_buckets to struct + cache_set + +An increase-only counter, to count how many buckets are indeed +reclaimed in journal_reclaim(). 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/journal.c | 1 + + drivers/md/bcache/sysfs.c | 5 +++++ + 3 files changed, 7 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 35396248a7d5..013e35a9e317 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -705,6 +705,7 @@ struct cache_set { + atomic_long_t writeback_keys_failed; + + atomic_long_t reclaim; ++ atomic_long_t reclaimed_journal_buckets; + atomic_long_t flush_write; + + enum { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 28e53b196a80..cc83c612937f 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -529,6 +529,7 @@ static void journal_reclaim(struct cache_set *c) + k->ptr[n++] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); ++ atomic_long_inc(&c->reclaimed_journal_buckets); + } + + if (n) { +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 0bfe4e30c501..4ab15442cab5 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -82,6 +82,7 @@ read_attribute(bset_tree_stats); + read_attribute(state); + read_attribute(cache_read_races); + read_attribute(reclaim); ++read_attribute(reclaimed_journal_buckets); + read_attribute(flush_write); + read_attribute(writeback_keys_done); + read_attribute(writeback_keys_failed); +@@ -700,6 +701,9 @@ SHOW(__bch_cache_set) + sysfs_print(reclaim, + atomic_long_read(&c->reclaim)); + ++ sysfs_print(reclaimed_journal_buckets, ++ atomic_long_read(&c->reclaimed_journal_buckets)); ++ + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + +@@ -926,6 +930,7 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + &sysfs_reclaim, ++ &sysfs_reclaimed_journal_buckets, + &sysfs_flush_write, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, +-- 
+2.16.4 + diff --git a/for-next/old2/0027-bcache-acquire-bch_register_lock-later-in-cached_dev.patch b/for-next/old2/0027-bcache-acquire-bch_register_lock-later-in-cached_dev.patch new file mode 100644 index 0000000..adf810c --- /dev/null +++ b/for-next/old2/0027-bcache-acquire-bch_register_lock-later-in-cached_dev.patch @@ -0,0 +1,38 @@ +From f8d7f061c544469e254d7e71f47f9430d244a35d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 01:06:12 +0800 +Subject: [PATCH 27/31] bcache: acquire bch_register_lock later in + cached_dev_detach_finish() + +To avoid potential deadlock on bch_register_lock when stopping kthread +or cancel kworker (which is done in sync way) + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 0bdf589ff5f2..5c76f4fd1661 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1018,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); + BUG_ON(refcount_read(&dc->count)); + +- mutex_lock(&bch_register_lock); + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); +@@ -1034,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + ++ mutex_lock(&bch_register_lock); ++ + calc_cached_dev_sectors(dc->disk.c); + bcache_device_detach(&dc->disk); + list_move(&dc->list, &uncached_devices); +-- +2.16.4 + diff --git a/for-next/old2/0028-bcache-move-dc-io_disable-into-dc-flags.patch b/for-next/old2/0028-bcache-move-dc-io_disable-into-dc-flags.patch new file mode 100644 index 0000000..ad13329 --- /dev/null +++ b/for-next/old2/0028-bcache-move-dc-io_disable-into-dc-flags.patch @@ -0,0 +1,170 @@ +From acfa48f0d22b1288b8947fb2a00b68bc191e42ea Mon Sep 17 
00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 01:41:01 +0800 +Subject: [PATCH 28/31] bcache: move dc->io_disable into dc->flags + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 3 ++- + drivers/md/bcache/request.c | 4 ++-- + drivers/md/bcache/super.c | 36 ++++++++++++++++++++++-------------- + drivers/md/bcache/sysfs.c | 9 +++++++-- + 4 files changed, 33 insertions(+), 19 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 013e35a9e317..ccfc3b245462 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -362,7 +362,8 @@ struct cached_dev { + unsigned int sequential_cutoff; + unsigned int readahead; + +- unsigned int io_disable:1; ++#define CACHED_DEV_IO_DISABLED 0 ++ unsigned long flags; + unsigned int verify:1; + unsigned int bypass_torture_test:1; + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 41adcd1546f1..4bdf5be04c0a 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -1175,7 +1175,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + int rw = bio_data_dir(bio); + + if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || +- dc->io_disable)) { ++ test_bit(CACHED_DEV_IO_DISABLED, &dc->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; +@@ -1236,7 +1236,7 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, + { + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + +- if (dc->io_disable) ++ if (test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) + return -EIO; + + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 5c76f4fd1661..af221bac5ac1 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -888,10 +888,11 @@ static int cached_dev_status_update(void *arg) + + /* + * If this delayed worker 
is stopping outside, directly quit here. +- * dc->io_disable might be set via sysfs interface, so check it +- * here too. ++ * CACHED_DEV_IO_DISABLED might be set via sysfs interface, so check ++ * it here too. + */ +- while (!kthread_should_stop() && !dc->io_disable) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) { + q = bdev_get_queue(dc->bdev); + if (blk_queue_dying(q)) + dc->offline_seconds++; +@@ -904,8 +905,11 @@ static int cached_dev_status_update(void *arg) + BACKING_DEV_OFFLINE_TIMEOUT); + pr_err("%s: disable I/O request due to backing " + "device offline", dc->disk.name); +- dc->io_disable = true; +- /* let others know earlier that io_disable is true */ ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ /* ++ * let others know earlier that CACHED_DEV_IO_DISABLED ++ * is set. ++ */ + smp_mb(); + bcache_device_stop(&dc->disk); + break; +@@ -929,7 +933,7 @@ int bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + +- if (dc->io_disable) { ++ if (test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) { + pr_err("I/O disabled on cached dev %s", + dc->backing_dev_name); + return -EIO; +@@ -1305,7 +1309,11 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) + q->backing_dev_info->ra_pages); + + atomic_set(&dc->io_errors, 0); +- dc->io_disable = false; ++ /* ++ * Clear following bit position in dc->flags ++ * - CACHED_DEV_IO_DISABLED ++ */ ++ dc->flags = 0; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; + /* default to auto */ + dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; +@@ -1480,8 +1488,8 @@ bool bch_cached_dev_error(struct cached_dev *dc) + if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) + return false; + +- dc->io_disable = true; +- /* make others know io_disable is true earlier */ ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ /* make others know CACHED_DEV_IO_DISABLED is set earlier */ + smp_mb(); + + pr_err("stop %s: too many IO errors on backing device %s\n", 
+@@ -1489,7 +1497,7 @@ bool bch_cached_dev_error(struct cached_dev *dc) + + /* + * If the cached device is still attached to a cache set, +- * even dc->io_disable is true and no more I/O requests ++ * even CACHED_DEV_IO_DISABLED is set and no more I/O requests + * accepted, cache device internal I/O (writeback scan or + * garbage collection) may still prevent bcache device from + * being stopped. So here CACHE_SET_IO_DISABLE should be +@@ -1672,11 +1680,11 @@ static void conditional_stop_bcache_device(struct cache_set *c, + * behavior may also introduce potential inconsistence + * data in writeback mode while cache is dirty. + * Therefore before calling bcache_device_stop() due +- * to a broken cache device, dc->io_disable should be +- * explicitly set to true. ++ * to a broken cache device, CACHED_DEV_IO_DISABLED should ++ * be explicitly set. + */ +- dc->io_disable = true; +- /* make others know io_disable is true earlier */ ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ /* make others know CACHED_DEV_IO_DISABLED is set earlier */ + smp_mb(); + bcache_device_stop(d); + } else { +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4ab15442cab5..4bb1592270b1 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -180,7 +180,8 @@ SHOW(__bch_cached_dev) + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); + sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); + sysfs_printf(io_error_limit, "%i", dc->error_limit); +- sysfs_printf(io_disable, "%i", dc->io_disable); ++ sysfs_printf(io_disable, "%i", ++ (int)test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)); + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_i_term_inverse); + var_print(writeback_rate_p_term_inverse); +@@ -319,7 +320,11 @@ STORE(__cached_dev) + if (attr == &sysfs_io_disable) { + int v = strtoul_or_return(buf); + +- dc->io_disable = v ? 
1 : 0; ++ if (v > 0) ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ else ++ clear_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ return size; + } + + sysfs_strtoul_clamp(sequential_cutoff, +-- +2.16.4 + diff --git a/for-next/old2/0029-bcache-add-CACHED_DEV_FREEING-to-dc-flags.patch b/for-next/old2/0029-bcache-add-CACHED_DEV_FREEING-to-dc-flags.patch new file mode 100644 index 0000000..c3220ed --- /dev/null +++ b/for-next/old2/0029-bcache-add-CACHED_DEV_FREEING-to-dc-flags.patch @@ -0,0 +1,51 @@ +From 5c1a6b0a78a71c8aa454720c3652343b46a953cb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 03:18:23 +0800 +Subject: [PATCH 29/31] bcache: add CACHED_DEV_FREEING to dc->flags + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/super.c | 7 +++++++ + 2 files changed, 8 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index ccfc3b245462..aae69060db7a 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -363,6 +363,7 @@ struct cached_dev { + unsigned int readahead; + + #define CACHED_DEV_IO_DISABLED 0 ++#define CACHED_DEV_FREEING 1 + unsigned long flags; + unsigned int verify:1; + unsigned int bypass_torture_test:1; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index af221bac5ac1..2c3348aff926 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1227,6 +1227,12 @@ static void cached_dev_free(struct closure *cl) + { + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + ++ if (test_and_set_bit(CACHED_DEV_FREEING, &dc->flags)) { ++ pr_info("cached device %s is freeing already", ++ dc->backing_dev_name); ++ return; ++ } ++ + mutex_lock(&bch_register_lock); + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) +@@ -1312,6 +1318,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) + /* + * Clear following bit position in 
dc->flags + * - CACHED_DEV_IO_DISABLED ++ * - CACHED_DEV_FREEING + */ + dc->flags = 0; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; +-- +2.16.4 + diff --git a/for-next/old2/0030-bcache-acquire-bch_register_lock-later-in-cached_dev.patch b/for-next/old2/0030-bcache-acquire-bch_register_lock-later-in-cached_dev.patch new file mode 100644 index 0000000..5eb31f8 --- /dev/null +++ b/for-next/old2/0030-bcache-acquire-bch_register_lock-later-in-cached_dev.patch @@ -0,0 +1,39 @@ +From f46c81a38a1dd0d5550a366f4676480b8648575d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 03:21:16 +0800 +Subject: [PATCH 30/31] bcache: acquire bch_register_lock later in + cached_dev_free() + +to avoid deadlock when stopping workqueue. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 2c3348aff926..e8dc7a991696 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1233,8 +1233,6 @@ static void cached_dev_free(struct closure *cl) + return; + } + +- mutex_lock(&bch_register_lock); +- + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); + +@@ -1247,6 +1245,9 @@ static void cached_dev_free(struct closure *cl) + + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); ++ ++ mutex_lock(&bch_register_lock); ++ + bcache_device_free(&dc->disk); + list_del(&dc->list); + +-- +2.16.4 + diff --git a/for-next/old2/0031-bcache-replace-system_wq-to-bcache_mod_wq.patch b/for-next/old2/0031-bcache-replace-system_wq-to-bcache_mod_wq.patch new file mode 100644 index 0000000..494678c --- /dev/null +++ b/for-next/old2/0031-bcache-replace-system_wq-to-bcache_mod_wq.patch @@ -0,0 +1,104 @@ +From 1a99d8f9e3c0279938bd5a5f552c88b2b1a7cbea Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 
18:55:09 +0800 +Subject: [PATCH 31/31] bcache: replace system_wq to bcache_mod_wq + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 3 ++- + drivers/md/bcache/btree.c | 4 ++-- + drivers/md/bcache/journal.c | 2 +- + drivers/md/bcache/sysfs.c | 2 +- + drivers/md/bcache/writeback.c | 4 ++-- + 5 files changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index aae69060db7a..e7f0c42ab234 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -870,10 +870,11 @@ do { \ + for (b = (ca)->buckets + (ca)->sb.first_bucket; \ + b < (ca)->buckets + (ca)->sb.nbuckets; b++) + ++extern struct workqueue_struct *bcache_mod_wq; + static inline void cached_dev_put(struct cached_dev *dc) + { + if (refcount_dec_and_test(&dc->count)) +- schedule_work(&dc->detach); ++ queue_work(bcache_mod_wq, &dc->detach); + } + + static inline bool cached_dev_get(struct cached_dev *dc) +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index c0dd8fde37af..8325a2d11717 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -366,7 +366,7 @@ static void __btree_node_write_done(struct closure *cl) + btree_complete_write(b, w); + + if (btree_node_dirty(b)) +- schedule_delayed_work(&b->work, 30 * HZ); ++ queue_delayed_work(bcache_mod_wq, &b->work, 30 * HZ); + + closure_return_with_destructor(cl, btree_node_write_unlock); + } +@@ -539,7 +539,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) + BUG_ON(!i->keys); + + if (!btree_node_dirty(b)) +- schedule_delayed_work(&b->work, 30 * HZ); ++ queue_delayed_work(bcache_mod_wq, &b->work, 30 * HZ); + + set_btree_node_dirty(b); + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index cc83c612937f..4205b24179bd 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -802,7 +802,7 @@ atomic_t *bch_journal(struct cache_set *c, + 
journal_try_write(c); + } else if (!w->dirty) { + w->dirty = true; +- schedule_delayed_work(&c->journal.work, ++ queue_delayed_work(bcache_mod_wq, &c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4bb1592270b1..849146d539c9 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -447,7 +447,7 @@ STORE(bch_cached_dev) + + if (attr == &sysfs_writeback_percent) + if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) +- schedule_delayed_work(&dc->writeback_rate_update, ++ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 73f0efac2b9f..54f68ae9d343 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -212,7 +212,7 @@ static void update_writeback_rate(struct work_struct *work) + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { +- schedule_delayed_work(&dc->writeback_rate_update, ++ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } + +@@ -835,7 +835,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + dc->writeback_running = true; + + WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); +- schedule_delayed_work(&dc->writeback_rate_update, ++ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + bch_writeback_queue(dc); +-- +2.16.4 + diff --git a/for-next/old3/0000-cover-letter.patch b/for-next/old3/0000-cover-letter.patch new file mode 100644 index 0000000..3cc1673 --- /dev/null +++ b/for-next/old3/0000-cover-letter.patch @@ -0,0 +1,63 @@ +From 095ebbae1dbccae3c04ac4432e8dc550568ab11e Mon Sep 17 00:00:00 2001 +From: 
Coly Li <colyli@suse.de> +Date: Tue, 4 Jun 2019 14:53:24 +0800 +Subject: [PATCH 00/32] *** SUBJECT HERE *** + +*** BLURB HERE *** + +Alexandru Ardelean (1): + bcache: use sysfs_match_string() instead of __sysfs_match_string() + +Coly Li (31): + bcache: avoid flushing btree node in cache_set_flush() if io disabled + bcache: Revert "bcache: fix high CPU occupancy during journal" + bcache: Revert "bcache: free heap cache_set->flush_btree in + bch_journal_free" + bcache: ignore read-ahead request failure on backing device + bcache: add io error counting in write_bdev_super_endio() + bcache: remove "XXX:" comment line from run_cache_set() + bcache: remove unnecessary prefetch() in bset_search_tree() + bcache: make bset_search_tree() be more understandable + bcache: add return value check to bch_cached_dev_run() + bcache: remove unncessary code in bch_btree_keys_init() + bcache: avoid a deadlock in bcache_reboot() + bcache: check CACHE_SET_IO_DISABLE in allocator code + bcache: check CACHE_SET_IO_DISABLE bit in bch_journal() + bcache: add pendings_cleanup to stop pending bcache device + bcache: add code comments for journal_read_bucket() + bcache: set largest seq to ja->seq[bucket_index] in + journal_read_bucket() + bcache: more detailed error message to bcache_device_link() + bcache: add more error message in bch_cached_dev_attach() + bcache: improve error message in bch_cached_dev_run() + bcache: fix race in btree_flush_write() + bcache: remove retry_flush_write from struct cache_set + bcache: use bcache_mod_wq to replace system wide system_wq + bcache: add reclaimed_journal_buckets to struct cache_set + bcache: acquire bch_register_lock later in cached_dev_detach_finish() + bcache: move dc->io_disable into dc->flags + bcache: add CACHED_DEV_FREEING to dc->flags + bcache: acquire bch_register_lock later in cached_dev_free() + bcache: replace system_wq to bcache_mod_wq + bcache: shrink btree node cache after bch_btree_check() + bcache: fix potential deadlock in 
cached_def_free() + bcache: fix return value error in bch_journal_read() + + drivers/md/bcache/alloc.c | 9 ++ + drivers/md/bcache/bcache.h | 13 ++- + drivers/md/bcache/bset.c | 48 ++------ + drivers/md/bcache/btree.c | 23 +++- + drivers/md/bcache/btree.h | 2 + + drivers/md/bcache/io.c | 12 ++ + drivers/md/bcache/journal.c | 143 ++++++++++++++++------- + drivers/md/bcache/journal.h | 4 + + drivers/md/bcache/request.c | 4 +- + drivers/md/bcache/super.c | 262 ++++++++++++++++++++++++++++++++++-------- + drivers/md/bcache/sysfs.c | 74 ++++++++---- + drivers/md/bcache/util.h | 2 - + drivers/md/bcache/writeback.c | 8 +- + 13 files changed, 443 insertions(+), 161 deletions(-) + +-- +2.16.4 + diff --git a/for-next/old3/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch b/for-next/old3/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch new file mode 100644 index 0000000..b2f3a13 --- /dev/null +++ b/for-next/old3/0001-bcache-avoid-flushing-btree-node-in-cache_set_flush-.patch @@ -0,0 +1,53 @@ +From c10fba3925fb78d2a0d4122a6e86f58154a3c82e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 23 May 2019 23:18:10 +0800 +Subject: [PATCH 01/32] bcache: avoid flushing btree node in cache_set_flush() + if io disabled + +When cache_set_flush() is called for too many I/O errors detected on +cache device and the cache set is retiring, inside the function it +doesn't make sense to flush cached btree nodes from c->btree_cache +because CACHE_SET_IO_DISABLE is set on c->flags already and all I/Os +onto cache device will be rejected. + +This patch checks in cache_set_flush() whether CACHE_SET_IO_DISABLE +is set. If yes, then it avoids flushing the cached btree nodes to save +time and make cache set retiring faster.
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 1b63ac876169..f44a666271f5 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1570,13 +1570,17 @@ static void cache_set_flush(struct closure *cl) + if (!IS_ERR_OR_NULL(c->root)) + list_add(&c->root->list, &c->btree_cache); + +- /* Should skip this if we're unregistering because of an error */ +- list_for_each_entry(b, &c->btree_cache, list) { +- mutex_lock(&b->write_lock); +- if (btree_node_dirty(b)) +- __bch_btree_node_write(b, NULL); +- mutex_unlock(&b->write_lock); +- } ++ /* ++ * Avoid flushing cached nodes if cache set is retiring ++ * due to too many I/O errors detected. ++ */ ++ if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ list_for_each_entry(b, &c->btree_cache, list) { ++ mutex_lock(&b->write_lock); ++ if (btree_node_dirty(b)) ++ __bch_btree_node_write(b, NULL); ++ mutex_unlock(&b->write_lock); ++ } + + for_each_cache(ca, c, i) + if (ca->alloc_thread) +-- +2.16.4 + diff --git a/for-next/old3/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch b/for-next/old3/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch new file mode 100644 index 0000000..311c88e --- /dev/null +++ b/for-next/old3/0002-bcache-Revert-bcache-fix-high-CPU-occupancy-during-j.patch @@ -0,0 +1,129 @@ +From 8d978651ae76cf942f6cc404ef4d7c70e463dc3d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 28 May 2019 21:19:38 +0800 +Subject: [PATCH 02/32] bcache: Revert "bcache: fix high CPU occupancy during + journal" + +This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0. + +This patch enlarges a race between normal btree flush code path and +flush_btree_write(), which causes deadlock when journal space is +exhausted. 
Reverting this patch shrinks the race window from 128 btree +nodes to only 1 btree node. + +Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +Cc: Tang Junhui <tang.junhui.linux@gmail.com> +--- + drivers/md/bcache/bcache.h | 2 -- + drivers/md/bcache/journal.c | 47 +++++++++++++++------------------------------ + drivers/md/bcache/util.h | 2 -- + 3 files changed, 15 insertions(+), 36 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index fdf75352e16a..e30a983a68cd 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -726,8 +726,6 @@ struct cache_set { + + #define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +- +- DECLARE_HEAP(struct btree *, flush_btree); + }; + + struct bbio { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 12dae9348147..a7ff60100755 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -391,12 +391,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) + } + + /* Journalling */ +-#define journal_max_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +-#define journal_min_cmp(l, r) \ +- (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ +- fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) + + static void btree_flush_write(struct cache_set *c) + { +@@ -404,35 +398,25 @@ static void btree_flush_write(struct cache_set *c) + * Try to find the btree node with that references the oldest journal + * entry, best is our current candidate and is locked if non NULL: + */ +- struct btree *b; +- int i; ++ struct btree *b, *best; ++ unsigned i; + + atomic_long_inc(&c->flush_write); +- + retry: +- spin_lock(&c->journal.lock); +- if (heap_empty(&c->flush_btree)) { +- for_each_cached_btree(b, c, i)
+- if (btree_current_write(b)->journal) { +- if (!heap_full(&c->flush_btree)) +- heap_add(&c->flush_btree, b, +- journal_max_cmp); +- else if (journal_max_cmp(b, +- heap_peek(&c->flush_btree))) { +- c->flush_btree.data[0] = b; +- heap_sift(&c->flush_btree, 0, +- journal_max_cmp); +- } ++ best = NULL; ++ ++ for_each_cached_btree(b, c, i) ++ if (btree_current_write(b)->journal) { ++ if (!best) ++ best = b; ++ else if (journal_pin_cmp(c, ++ btree_current_write(best)->journal, ++ btree_current_write(b)->journal)) { ++ best = b; + } ++ } + +- for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) +- heap_sift(&c->flush_btree, i, journal_min_cmp); +- } +- +- b = NULL; +- heap_pop(&c->flush_btree, b, journal_min_cmp); +- spin_unlock(&c->journal.lock); +- ++ b = best; + if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { +@@ -870,8 +854,7 @@ int bch_journal_alloc(struct cache_set *c) + j->w[0].c = c; + j->w[1].c = c; + +- if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || +- !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + return -ENOMEM; +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index 1fbced94e4cc..c029f7443190 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -113,8 +113,6 @@ do { \ + + #define heap_full(h) ((h)->used == (h)->size) + +-#define heap_empty(h) ((h)->used == 0) +- + #define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ +-- +2.16.4 + diff --git a/for-next/old3/0003-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch b/for-next/old3/0003-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch new file mode 100644 index 0000000..d29132f --- /dev/null +++ b/for-next/old3/0003-bcache-Revert-bcache-free-heap-cache_set-flush_btree.patch @@ -0,0 +1,35 
@@ +From adf1ee061acd018bd6329661175fb6c442f8969a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 28 May 2019 21:36:56 +0800 +Subject: [PATCH 03/32] bcache: Revert "bcache: free heap + cache_set->flush_btree in bch_journal_free" + +This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6. + +This patch depends on commit c4dc2497d50d ("bcache: fix high CPU +occupancy during journal") which is reverted in previous patch. So +revert this one too. + +Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free") +Signed-off-by: Coly Li <colyli@suse.de> +Cc: stable@vger.kernel.org +Cc: Shenghui Wang <shhuiw@foxmail.com> +--- + drivers/md/bcache/journal.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index a7ff60100755..38849736fa1c 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -839,7 +839,6 @@ void bch_journal_free(struct cache_set *c) + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +- free_heap(&c->flush_btree); + } + + int bch_journal_alloc(struct cache_set *c) +-- +2.16.4 + diff --git a/for-next/old3/0004-bcache-ignore-read-ahead-request-failure-on-backing-.patch b/for-next/old3/0004-bcache-ignore-read-ahead-request-failure-on-backing-.patch new file mode 100644 index 0000000..e9fe77e --- /dev/null +++ b/for-next/old3/0004-bcache-ignore-read-ahead-request-failure-on-backing-.patch @@ -0,0 +1,55 @@ +From 732aba1ff2549438105b085b0387a270586ebdc3 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 22:48:09 +0800 +Subject: [PATCH 04/32] bcache: ignore read-ahead request failure on backing + device + +When md raid device (e.g. 
raid456) is used as backing device, read-ahead +requests on a degrading and recovering md raid device might be failed +immediately by md raid code, but indeed this md raid array can still be +read or written by normal I/O requests. Therefore such failed read-ahead +requests are not real hardware failures. Furthermore, after degrading and +recovering are accomplished, read-ahead requests will be handled by md raid +array again. + +For such condition, I/O failures of read-ahead requests don't indicate +real health status (because normal I/O can still be served), they should not +be counted into I/O error counter dc->io_errors. + +Since there is no simple way to detect whether the backing device is a +md raid device, this patch simply ignores I/O failures for read-ahead +bios on backing device, to avoid bogus backing device failure on a +degrading md raid array. + +Suggested-and-tested-by: Thorsten Knabe <linux@thorsten-knabe.de> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/io.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index c25097968319..4d93f07f63e5 100644 +--- a/drivers/md/bcache/io.c ++++ b/drivers/md/bcache/io.c +@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) + + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + ++ /* ++ * Read-ahead requests on a degrading and recovering md raid ++ * (e.g. raid6) device might be failured immediately by md ++ * raid code, which is not a real hardware media failure. So ++ * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
++ */ ++ if (bio->bi_opf & REQ_RAHEAD) { ++ pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", ++ dc->backing_dev_name); ++ return; ++ } ++ + errors = atomic_add_return(1, &dc->io_errors); + if (errors < dc->error_limit) + pr_err("%s: IO error on backing device, unrecoverable", +-- +2.16.4 + diff --git a/for-next/old3/0005-bcache-add-io-error-counting-in-write_bdev_super_end.patch b/for-next/old3/0005-bcache-add-io-error-counting-in-write_bdev_super_end.patch new file mode 100644 index 0000000..31d7539 --- /dev/null +++ b/for-next/old3/0005-bcache-add-io-error-counting-in-write_bdev_super_end.patch @@ -0,0 +1,38 @@ +From 4570746d46fbbdc0cd0ad54fa454f4ba9e16c8c2 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:42:39 +0800 +Subject: [PATCH 05/32] bcache: add io error counting in + write_bdev_super_endio() + +When backing device super block is written by bch_write_bdev_super(), +the bio complete callback write_bdev_super_endio() simply ignores I/O +status. Indeed such a write request also contributes to backing device +health status if the request failed. + +This patch checks bio->bi_status in write_bdev_super_endio(); if there +is an error, bch_count_backing_io_errors() will be called to count an I/O +error to dc->io_errors.
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f44a666271f5..c486a9de1219 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -197,7 +197,9 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + static void write_bdev_super_endio(struct bio *bio) + { + struct cached_dev *dc = bio->bi_private; +- /* XXX: error checking */ ++ ++ if (bio->bi_status) ++ bch_count_backing_io_errors(dc, bio); + + closure_put(&dc->sb_write); + } +-- +2.16.4 + diff --git a/for-next/old3/0006-bcache-remove-XXX-comment-line-from-run_cache_set.patch b/for-next/old3/0006-bcache-remove-XXX-comment-line-from-run_cache_set.patch new file mode 100644 index 0000000..994b11d --- /dev/null +++ b/for-next/old3/0006-bcache-remove-XXX-comment-line-from-run_cache_set.patch @@ -0,0 +1,31 @@ +From e84087569e15dca4859bd6413a2638f7eefa40f1 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 13 May 2019 23:47:38 +0800 +Subject: [PATCH 06/32] bcache: remove "XXX:" comment line from run_cache_set() + +In previous bcache patches for Linux v5.2, the failure code path of +run_cache_set() is tested and fixed. 
So now the following comment +line can be removed from run_cache_set(), + /* XXX: test this, it's broken */ + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index c486a9de1219..962c53493cf0 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1963,7 +1963,7 @@ static int run_cache_set(struct cache_set *c) + } + + closure_sync(&cl); +- /* XXX: test this, it's broken */ ++ + bch_cache_set_error(c, "%s", err); + + return -EIO; +-- +2.16.4 + diff --git a/for-next/old3/0007-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch b/for-next/old3/0007-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch new file mode 100644 index 0000000..e7a90f3 --- /dev/null +++ b/for-next/old3/0007-bcache-remove-unnecessary-prefetch-in-bset_search_tr.patch @@ -0,0 +1,56 @@ +From 327c62d3e7e990243c2ea930e3f60770cffb0124 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:23:35 +0800 +Subject: [PATCH 07/32] bcache: remove unnecessary prefetch() in + bset_search_tree() + +In function bset_search_tree(), when p >= t->size, t->tree[0] will be +prefetched by the following code piece, + 974 unsigned int p = n << 4; + 975 + 976 p &= ((int) (p - t->size)) >> 31; + 977 + 978 prefetch(&t->tree[p]); + +The purpose of the above code is to avoid a branch instruction, but +when p >= t->size, prefetch(&t->tree[0]) has no positive performance +contribution at all. This patch avoids the unnecessary prefetch by only +calling prefetch() when p < t->size.
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 16 ++-------------- + 1 file changed, 2 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index 8f07fa6e1739..aa2e4ab0fab9 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -960,22 +960,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + unsigned int inorder, j, n = 1; + + do { +- /* +- * A bit trick here. +- * If p < t->size, (int)(p - t->size) is a minus value and +- * the most significant bit is set, right shifting 31 bits +- * gets 1. If p >= t->size, the most significant bit is +- * not set, right shifting 31 bits gets 0. +- * So the following 2 lines equals to +- * if (p >= t->size) +- * p = 0; +- * but a branch instruction is avoided. +- */ + unsigned int p = n << 4; + +- p &= ((int) (p - t->size)) >> 31; +- +- prefetch(&t->tree[p]); ++ if (p < t->size) ++ prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; +-- +2.16.4 + diff --git a/for-next/old3/0008-bcache-make-bset_search_tree-be-more-understandable.patch b/for-next/old3/0008-bcache-make-bset_search_tree-be-more-understandable.patch new file mode 100644 index 0000000..dd3b408 --- /dev/null +++ b/for-next/old3/0008-bcache-make-bset_search_tree-be-more-understandable.patch @@ -0,0 +1,58 @@ +From 6a3180c3fea9089b72d0e1ff51816b1552c26117 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 14 May 2019 22:51:40 +0800 +Subject: [PATCH 08/32] bcache: make bset_search_tree() be more understandable + +The purpose of following code in bset_search_tree() is to avoid a branch +instruction, + 994 if (likely(f->exponent != 127)) + 995 n = j * 2 + (((unsigned int) + 996 (f->mantissa - + 997 bfloat_mantissa(search, f))) >> 31); + 998 else + 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) +1000 ? 
j * 2 +1001 : j * 2 + 1; + +This piece of code is not very clear to understand; even when I tried to +add a code comment for it, I made a mistake. This patch removes the implicit +bit operation and uses an explicit branch to calculate the next location in +binary tree search. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 17 +++-------------- + 1 file changed, 3 insertions(+), 14 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index aa2e4ab0fab9..f752cc791f50 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -968,21 +968,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, + j = n; + f = &t->tree[j]; + +- /* +- * Similar bit trick, use subtract operation to avoid a branch +- * instruction. +- * +- * n = (f->mantissa > bfloat_mantissa()) +- * ? j * 2 +- * : j * 2 + 1; +- * +- * We need to subtract 1 from f->mantissa for the sign bit trick +- * to work - that's done in make_bfloat() +- */ + if (likely(f->exponent != 127)) +- n = j * 2 + (((unsigned int) +- (f->mantissa - +- bfloat_mantissa(search, f))) >> 31); ++ n = (f->mantissa >= bfloat_mantissa(search, f)) ++ ? j * 2 ++ : j * 2 + 1; + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ?
j * 2 +-- +2.16.4 + diff --git a/for-next/old3/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch b/for-next/old3/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch new file mode 100644 index 0000000..c83acc1 --- /dev/null +++ b/for-next/old3/0009-bcache-use-sysfs_match_string-instead-of-__sysfs_mat.patch @@ -0,0 +1,97 @@ +From de0a1c6ff54f0aa7005470614fcc07e57d6477d5 Mon Sep 17 00:00:00 2001 +From: Alexandru Ardelean <alexandru.ardelean@analog.com> +Date: Tue, 7 May 2019 12:43:12 +0300 +Subject: [PATCH 09/32] bcache: use sysfs_match_string() instead of + __sysfs_match_string() + +The arrays (of strings) that are passed to __sysfs_match_string() are +static, so use sysfs_match_string() which does an implicit ARRAY_SIZE() +over these arrays. + +Functionally, this doesn't change anything. +The change is more cosmetic. + +It only shrinks the static arrays by 1 byte each. + +Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/sysfs.c | 20 ++++++++------------ + 1 file changed, 8 insertions(+), 12 deletions(-) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 6cd44d3cf906..3a520262933d 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -21,28 +21,24 @@ static const char * const bch_cache_modes[] = { + "writethrough", + "writeback", + "writearound", +- "none", +- NULL ++ "none" + }; + + /* Default is 0 ("auto") */ + static const char * const bch_stop_on_failure_modes[] = { + "auto", +- "always", +- NULL ++ "always" + }; + + static const char * const cache_replacement_policies[] = { + "lru", + "fifo", +- "random", +- NULL ++ "random" + }; + + static const char * const error_actions[] = { + "unregister", +- "panic", +- NULL ++ "panic" + }; + + write_attribute(attach); +@@ -333,7 +329,7 @@ STORE(__cached_dev) + bch_cached_dev_run(dc); + + if (attr == &sysfs_cache_mode) { +- v = __sysfs_match_string(bch_cache_modes, 
-1, buf); ++ v = sysfs_match_string(bch_cache_modes, buf); + if (v < 0) + return v; + +@@ -344,7 +340,7 @@ STORE(__cached_dev) + } + + if (attr == &sysfs_stop_when_cache_set_failed) { +- v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); ++ v = sysfs_match_string(bch_stop_on_failure_modes, buf); + if (v < 0) + return v; + +@@ -794,7 +790,7 @@ STORE(__bch_cache_set) + 0, UINT_MAX); + + if (attr == &sysfs_errors) { +- v = __sysfs_match_string(error_actions, -1, buf); ++ v = sysfs_match_string(error_actions, buf); + if (v < 0) + return v; + +@@ -1058,7 +1054,7 @@ STORE(__bch_cache) + } + + if (attr == &sysfs_cache_replacement_policy) { +- v = __sysfs_match_string(cache_replacement_policies, -1, buf); ++ v = sysfs_match_string(cache_replacement_policies, buf); + if (v < 0) + return v; + +-- +2.16.4 + diff --git a/for-next/old3/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch b/for-next/old3/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch new file mode 100644 index 0000000..54f83c4 --- /dev/null +++ b/for-next/old3/0010-bcache-add-return-value-check-to-bch_cached_dev_run.patch @@ -0,0 +1,151 @@ +From d4d152860628f81b058c84f759fe74b635998204 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 22:16:38 +0800 +Subject: [PATCH 10/32] bcache: add return value check to bch_cached_dev_run() + +This patch adds a return value check to bch_cached_dev_run(); now if an +error happens inside bch_cached_dev_run(), it can be caught. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 32 +++++++++++++++++++++++++------- + drivers/md/bcache/sysfs.c | 7 +++++-- + 3 files changed, 31 insertions(+), 10 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index e30a983a68cd..cb268d7c6cea 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -1004,7 +1004,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); + int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); + void bch_cached_dev_detach(struct cached_dev *dc); +-void bch_cached_dev_run(struct cached_dev *dc); ++int bch_cached_dev_run(struct cached_dev *dc); + void bcache_device_stop(struct bcache_device *d); + + void bch_cache_set_unregister(struct cache_set *c); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 962c53493cf0..8bc5c55d0ee2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) + } + + +-void bch_cached_dev_run(struct cached_dev *dc) ++int bch_cached_dev_run(struct cached_dev *dc) + { + struct bcache_device *d = &dc->disk; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); +@@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + ++ if (dc->io_disable) ++ return -EIO; ++ + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); + kfree(buf); +- return; ++ return -EBUSY; + } + + if (!d->c && +@@ -951,8 +954,10 @@ void bch_cached_dev_run(struct cached_dev *dc) + kfree(buf); + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || +- sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) ++ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { + pr_debug("error creating sysfs link"); ++ return -ENOMEM; ++ } + + dc->status_update_thread = 
kthread_run(cached_dev_status_update, + dc, "bcache_status_update"); +@@ -961,6 +966,8 @@ void bch_cached_dev_run(struct cached_dev *dc) + "continue to run without monitoring backing " + "device status"); + } ++ ++ return 0; + } + + /* +@@ -1056,6 +1063,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); + struct uuid_entry *u; + struct cached_dev *exist_dc, *t; ++ int ret = 0; + + if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) +@@ -1165,7 +1173,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + + bch_sectors_dirty_init(&dc->disk); + +- bch_cached_dev_run(dc); ++ ret = bch_cached_dev_run(dc); ++ if (ret && (ret != -EBUSY)) { ++ up_write(&dc->writeback_lock); ++ return ret; ++ } ++ + bcache_device_link(&dc->disk, c, "bdev"); + atomic_inc(&c->attached_dev_nr); + +@@ -1292,6 +1305,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, + { + const char *err = "cannot allocate memory"; + struct cache_set *c; ++ int ret = -ENOMEM; + + bdevname(bdev, dc->backing_dev_name); + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); +@@ -1321,14 +1335,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, + bch_cached_dev_attach(dc, c, NULL); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || +- BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) +- bch_cached_dev_run(dc); ++ BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { ++ err = "failed to run cached device"; ++ ret = bch_cached_dev_run(dc); ++ if (ret) ++ goto err; ++ } + + return 0; + err: + pr_notice("error %s: %s", dc->backing_dev_name, err); + bcache_device_stop(&dc->disk); +- return -EIO; ++ return ret; + } + + /* Flash only volumes */ +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 3a520262933d..129031663cc8 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -325,8 
+325,11 @@ STORE(__cached_dev) + bch_cache_accounting_clear(&dc->accounting); + + if (attr == &sysfs_running && +- strtoul_or_return(buf)) +- bch_cached_dev_run(dc); ++ strtoul_or_return(buf)) { ++ v = bch_cached_dev_run(dc); ++ if (v) ++ return v; ++ } + + if (attr == &sysfs_cache_mode) { + v = sysfs_match_string(bch_cache_modes, buf); +-- +2.16.4 + diff --git a/for-next/old3/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch b/for-next/old3/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch new file mode 100644 index 0000000..d65ff35 --- /dev/null +++ b/for-next/old3/0011-bcache-remove-unncessary-code-in-bch_btree_keys_init.patch @@ -0,0 +1,72 @@ +From 916a60181db538619e372a4f76e5a01120478b79 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 22:36:35 +0800 +Subject: [PATCH 11/32] bcache: remove unncessary code in bch_btree_keys_init() + +Function bch_btree_keys_init() initializes b->set[].size and +b->set[].data to zero. As the code comments indicates, these code indeed +is unncessary, because both struct btree_keys and struct bset_tree are +nested embedded into struct btree, when struct btree is filled with 0 +bits by kzalloc() in mca_bucket_alloc(), b->set[].size and +b->set[].data are initialized to 0 (a.k.a NULL) already. + +This patch removes the redundant code, and add comments in +bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 15 ++++++--------- + drivers/md/bcache/btree.c | 4 ++++ + 2 files changed, 10 insertions(+), 9 deletions(-) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index f752cc791f50..32e2e4d8fa6c 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); + void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) + { +- unsigned int i; +- + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + +- /* XXX: shouldn't be needed */ +- for (i = 0; i < MAX_BSETS; i++) +- b->set[i].size = 0; + /* +- * Second loop starts at 1 because b->keys[0]->data is the memory we +- * allocated ++ * struct btree_keys in embedded in struct btree, and struct ++ * bset_tree is embedded into struct btree_keys. They are all ++ * initialized as 0 by kzalloc() in mca_bucket_alloc(), and ++ * b->set[0].data is allocated in bch_btree_keys_alloc(), so we ++ * don't have to initiate b->set[].size and b->set[].data here ++ * any more. + */ +- for (i = 1; i < MAX_BSETS; i++) +- b->set[i].data = NULL; + } + EXPORT_SYMBOL(bch_btree_keys_init); + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 773f5fdad25f..cf38a1b031fa 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) + static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) + { ++ /* ++ * kzalloc() is necessary here for initialization, ++ * see code comments in bch_btree_keys_init(). 
++ */ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + + if (!b) +-- +2.16.4 + diff --git a/for-next/old3/0012-bcache-avoid-a-deadlock-in-bcache_reboot.patch b/for-next/old3/0012-bcache-avoid-a-deadlock-in-bcache_reboot.patch new file mode 100644 index 0000000..c8f5660 --- /dev/null +++ b/for-next/old3/0012-bcache-avoid-a-deadlock-in-bcache_reboot.patch @@ -0,0 +1,212 @@ +From 6fccf1970e4705e9924329dc4dbb24181338a257 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 21 May 2019 23:19:55 +0800 +Subject: [PATCH 12/32] bcache: avoid a deadlock in bcache_reboot() + +It is quite frequently to observe deadlock in bcache_reboot() happens +and hang the system reboot process. The reason is, in bcache_reboot() +when calling bch_cache_set_stop() and bcache_device_stop() the mutex +bch_register_lock is held. But in the process to stop cache set and +bcache device, bch_register_lock will be acquired again. If this mutex +is held here, deadlock will happen inside the stopping process. The +aftermath of the deadlock is, whole system reboot gets hung. + +The fix is to avoid holding bch_register_lock for the following loops +in bcache_reboot(), + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + +A module range variable 'bcache_is_reboot' is added, it sets to true +in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked +to be true, reject the registration by returning -EBUSY immediately. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 41 ++++++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++++++++ + 2 files changed, 66 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 8bc5c55d0ee2..978689d4363c 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -40,6 +40,7 @@ static const char invalid_uuid[] = { + + static struct kobject *bcache_kobj; + struct mutex bch_register_lock; ++bool bcache_is_reboot; + LIST_HEAD(bch_cache_sets); + static LIST_HEAD(uncached_devices); + +@@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; + struct workqueue_struct *bcache_wq; + struct workqueue_struct *bch_journal_wq; + ++ + #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) + /* limitation of partitions number on single bcache device */ + #define BCACHE_MINORS 128 +@@ -2325,6 +2327,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + ++ /* For latest state of bcache_is_reboot */ ++ smp_mb(); ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + path = kstrndup(buffer, size, GFP_KERNEL); + if (!path) + goto err; +@@ -2404,6 +2411,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + { ++ if (bcache_is_reboot) ++ return NOTIFY_DONE; ++ + if (code == SYS_DOWN || + code == SYS_HALT || + code == SYS_POWER_OFF) { +@@ -2416,19 +2426,46 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + + mutex_lock(&bch_register_lock); + ++ if (bcache_is_reboot) { ++ goto out; ++ } ++ ++ /* New registration is rejected since now */ ++ bcache_is_reboot = true; ++ /* ++ * Make registering caller (if there is) on other CPU ++ * core know bcache_is_reboot set to true earlier ++ */ ++ smp_mb(); ++ + if 
(list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + ++ mutex_unlock(&bch_register_lock); ++ + pr_info("Stopping all devices:"); + ++ /* ++ * The reason bch_register_lock is not held to call ++ * bch_cache_set_stop() and bcache_device_stop() is to ++ * avoid potential deadlock during reboot, because cache ++ * set or bcache device stopping process will acqurie ++ * bch_register_lock too. ++ * ++ * We are safe here because bcache_is_reboot sets to ++ * true already, register_bcache() will reject new ++ * registration now. bcache_is_reboot also makes sure ++ * bcache_reboot() won't be re-entered on by other thread, ++ * so there is no race in following list iteration by ++ * list_for_each_entry_safe(). ++ */ + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + +- mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and +@@ -2555,6 +2592,8 @@ static int __init bcache_init(void) + bch_debug_init(); + closure_debug_init(); + ++ bcache_is_reboot = false; ++ + return 0; + err: + bcache_exit(); +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 129031663cc8..961a13a223ee 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -16,6 +16,8 @@ + #include <linux/sort.h> + #include <linux/sched/clock.h> + ++extern bool bcache_is_reboot; ++ + /* Default is 0 ("writethrough") */ + static const char * const bch_cache_modes[] = { + "writethrough", +@@ -267,6 +269,10 @@ STORE(__cached_dev) + struct cache_set *c; + struct kobj_uevent_env *env; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + #define d_strtoul(var) sysfs_strtoul(var, dc->var) + #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) + #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) +@@ -407,6 +413,10 @@ 
STORE(bch_cached_dev) + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + mutex_lock(&bch_register_lock); + size = __cached_dev_store(kobj, attr, buf, size); + +@@ -505,6 +515,10 @@ STORE(__bch_flash_dev) + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + sysfs_strtoul(data_csum, d->data_csum); + + if (attr == &sysfs_size) { +@@ -740,6 +754,10 @@ STORE(__bch_cache_set) + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + ssize_t v; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + +@@ -859,6 +877,10 @@ STORE(bch_cache_set_internal) + { + struct cache_set *c = container_of(kobj, struct cache_set, internal); + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + return bch_cache_set_store(&c->kobj, attr, buf, size); + } + +@@ -1044,6 +1066,10 @@ STORE(__bch_cache) + struct cache *ca = container_of(kobj, struct cache, kobj); + ssize_t v; + ++ /* no user space access if system is rebooting */ ++ if (bcache_is_reboot) ++ return -EBUSY; ++ + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + +-- +2.16.4 + diff --git a/for-next/old3/0013-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch b/for-next/old3/0013-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch new file mode 100644 index 0000000..4f1d0e4 --- /dev/null +++ b/for-next/old3/0013-bcache-check-CACHE_SET_IO_DISABLE-in-allocator-code.patch @@ -0,0 +1,52 @@ +From e5b879a8c9c431f7dfce7ebf1edd7f277a5463c0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 22 May 2019 21:55:09 +0800 +Subject: [PATCH 13/32] bcache: check CACHE_SET_IO_DISABLE in allocator code + 
+If the CACHE_SET_IO_DISABLE flag of a cache set is set because of too many +I/O errors, currently the allocator routines can still continue to allocate +space, which may introduce inconsistent metadata state. + +This patch checks the CACHE_SET_IO_DISABLE bit in the following allocator +routines, +- bch_bucket_alloc() +- __bch_bucket_alloc_set() +Once CACHE_SET_IO_DISABLE is set on the cache set, the allocator routines +may reject allocation requests earlier to avoid potential inconsistent +metadata. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/alloc.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index f8986effcb50..34ae5bb6724a 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) + struct bucket *b; + long r; + ++ ++ /* No allocation if CACHE_SET_IO_DISABLE set */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) ++ return -1; ++ + /* fastpath */ + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) +@@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + { + int i; + ++ /* No allocation if CACHE_SET_IO_DISABLE set */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return -1; ++ + lockdep_assert_held(&c->bucket_lock); + BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); + +-- +2.16.4 + diff --git a/for-next/old3/0014-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch b/for-next/old3/0014-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch new file mode 100644 index 0000000..811bdce --- /dev/null +++ b/for-next/old3/0014-bcache-check-CACHE_SET_IO_DISABLE-bit-in-bch_journal.patch @@ -0,0 +1,39 @@ +From 3ed211c0914ea6b5fca88b386b586b17e3ddb835 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 22 May 2019 22:06:21 +0800 +Subject: [PATCH 14/32] bcache: 
check CACHE_SET_IO_DISABLE bit in bch_journal() + +When too many I/O errors happen on a cache set and the CACHE_SET_IO_DISABLE +bit is set, bch_journal() may continue to work because the journaling +bkey might still be in the write set. The caller of bch_journal() may +believe the journal still works, but the truth is that the in-memory journal +write set won't be written to the cache device any more. This behavior may +introduce potential inconsistent metadata status. + +This patch checks the CACHE_SET_IO_DISABLE bit at the head of bch_journal(); +if the bit is set, bch_journal() returns NULL immediately to notify the +caller that the journal does not work. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 38849736fa1c..ddbdbeb758e8 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -795,6 +795,10 @@ atomic_t *bch_journal(struct cache_set *c, + struct journal_write *w; + atomic_t *ret; + ++ /* No journaling if CACHE_SET_IO_DISABLE set already */ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return NULL; ++ + if (!CACHE_SYNC(&c->sb)) + return NULL; + +-- +2.16.4 + diff --git a/for-next/old3/0015-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch b/for-next/old3/0015-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch new file mode 100644 index 0000000..3ab3f1a --- /dev/null +++ b/for-next/old3/0015-bcache-add-pendings_cleanup-to-stop-pending-bcache-d.patch @@ -0,0 +1,107 @@ +From ac49935ca5ad1a451e2b47091866f9e20fb87c4a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 20 Mar 2019 23:11:59 +0800 +Subject: [PATCH 15/32] bcache: add pendings_cleanup to stop pending bcache + device + +If a bcache device is in dirty state and its cache set is not +registered, this bcache device will not appear in /dev/bcache<N>, +and there is no way to stop it or remove the bcache 
kernel module. + +This is an as-designed behavior, but sometimes people has to reboot +whole system to release or stop the pending backing device. + +This sysfs interface may remove such pending bcache devices when +write anything into the sysfs file manually. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 55 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 978689d4363c..905aece72664 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2277,9 +2277,13 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, + + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); ++kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); + + static bool bch_is_open_backing(struct block_device *bdev) + { +@@ -2409,6 +2413,56 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + goto out; + } + ++ ++struct pdev { ++ struct list_head list; ++ struct cached_dev *dc; ++}; ++ ++static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, ++ size_t size) ++{ ++ LIST_HEAD(pending_devs); ++ ssize_t ret = size; ++ struct cached_dev *dc, *tdc; ++ struct pdev *pdev, *tpdev; ++ struct cache_set *c, *tc; ++ ++ mutex_lock(&bch_register_lock); ++ list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { ++ pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); ++ if (!pdev) ++ break; ++ pdev->dc = dc; ++ list_add(&pdev->list, &pending_devs); ++ } ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ 
list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { ++ char *pdev_set_uuid = pdev->dc->sb.set_uuid; ++ char *set_uuid = c->sb.uuid; ++ ++ if (!memcmp(pdev_set_uuid, set_uuid, 16)) { ++ list_del(&pdev->list); ++ kfree(pdev); ++ break; ++ } ++ } ++ } ++ mutex_unlock(&bch_register_lock); ++ ++ list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { ++ pr_info("delete pdev %p", pdev); ++ list_del(&pdev->list); ++ bcache_device_stop(&pdev->dc->disk); ++ kfree(pdev); ++ } ++ ++ return ret; ++} ++ + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) + { + if (bcache_is_reboot) +@@ -2557,6 +2611,7 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++ &ksysfs_pendings_cleanup.attr, + NULL + }; + +-- +2.16.4 + diff --git a/for-next/old3/0016-bcache-add-code-comments-for-journal_read_bucket.patch b/for-next/old3/0016-bcache-add-code-comments-for-journal_read_bucket.patch new file mode 100644 index 0000000..5c2a03a --- /dev/null +++ b/for-next/old3/0016-bcache-add-code-comments-for-journal_read_bucket.patch @@ -0,0 +1,71 @@ +From 275ca04c5e2105b800bcb8440fcd1d9d5de8aca9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 30 May 2019 18:39:17 +0800 +Subject: [PATCH 16/32] bcache: add code comments for journal_read_bucket() + +make code to be more understandible + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index ddbdbeb758e8..7f7f5e947d7e 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset; + + blocks = set_blocks(j, block_bytes(ca->set)); + ++ /* ++ * Nodes in 'list' are in linear increasing order of ++ * i->j.seq, the node on head has the smallest (oldest) ++ * journal seq, 
the node on tail has the biggest ++ * (latest) journal seq. ++ */ ++ ++ /* ++ * Check from the oldest jset for last_seq. If ++ * i->j.seq < j->last_seq, it means the oldest jset ++ * in list is expired and useless, remove it from ++ * this list. Otherwise, j is a condidate jset for ++ * further following checks. ++ */ + while (!list_empty(list)) { + i = list_first_entry(list, + struct journal_replay, list); +@@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; + kfree(i); + } + ++ /* iterate list in reverse order (from latest jset) */ + list_for_each_entry_reverse(i, list, list) { + if (j->seq == i->j.seq) + goto next_set; + ++ /* ++ * if j->seq is less than any i->j.last_seq ++ * in list, j is an expired and useless jset. ++ */ + if (j->seq < i->j.last_seq) + goto next_set; + ++ /* ++ * 'where' points to first jset in list which ++ * is elder then j. ++ */ + if (j->seq > i->j.seq) { + where = &i->list; + goto add; +@@ -129,6 +152,7 @@ reread: left = ca->sb.bucket_size - offset; + if (!i) + return -ENOMEM; + memcpy(&i->j, j, bytes); ++ /* Add to the location after 'where' points to */ + list_add(&i->list, where); + ret = 1; + +-- +2.16.4 + diff --git a/for-next/old3/0017-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch b/for-next/old3/0017-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch new file mode 100644 index 0000000..341f156 --- /dev/null +++ b/for-next/old3/0017-bcache-set-largest-seq-to-ja-seq-bucket_index-in-jou.patch @@ -0,0 +1,30 @@ +From 3ccac67fe77c5735537e609f9d3aeb70e72eaaca Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 30 May 2019 18:40:37 +0800 +Subject: [PATCH 17/32] bcache: set largest seq to ja->seq[bucket_index] in + journal_read_bucket() + +Make sure always setting largest seq to ja->seq[bucket_index] + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/journal.c 
b/drivers/md/bcache/journal.c +index 7f7f5e947d7e..152ec33981be 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -156,7 +156,8 @@ reread: left = ca->sb.bucket_size - offset; + list_add(&i->list, where); + ret = 1; + +- ja->seq[bucket_index] = j->seq; ++ if (j->seq > ja->seq[bucket_index]) ++ ja->seq[bucket_index] = j->seq; + next_set: + offset += blocks * ca->sb.block_size; + len -= blocks * ca->sb.block_size; +-- +2.16.4 + diff --git a/for-next/old3/0018-bcache-more-detailed-error-message-to-bcache_device_.patch b/for-next/old3/0018-bcache-more-detailed-error-message-to-bcache_device_.patch new file mode 100644 index 0000000..9d15aa5 --- /dev/null +++ b/for-next/old3/0018-bcache-more-detailed-error-message-to-bcache_device_.patch @@ -0,0 +1,43 @@ +From 6ba232dc1f9e4929de7359fe4a021223bed110ed Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 00:57:38 +0800 +Subject: [PATCH 18/32] bcache: more detailed error message to + bcache_device_link() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 905aece72664..f0ec25b42603 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -695,6 +695,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + { + unsigned int i; + struct cache *ca; ++ int ret; + + for_each_cache(ca, d->c, i) + bd_link_disk_holder(ca->bdev, d->disk); +@@ -702,9 +703,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + +- WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || +- sysfs_create_link(&c->kobj, &d->kobj, d->name), +- "Couldn't create device <-> cache set symlinks"); ++ ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); ++ if (ret < 0) ++ pr_err("Couldn't create device 
-> cache set symlink"); ++ ++ ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); ++ if (ret < 0) ++ pr_err("Couldn't create cache set -> device symlink"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); + } +-- +2.16.4 + diff --git a/for-next/old3/0019-bcache-add-more-error-message-in-bch_cached_dev_atta.patch b/for-next/old3/0019-bcache-add-more-error-message-in-bch_cached_dev_atta.patch new file mode 100644 index 0000000..c4b4719 --- /dev/null +++ b/for-next/old3/0019-bcache-add-more-error-message-in-bch_cached_dev_atta.patch @@ -0,0 +1,36 @@ +From 28bbad50499a843afa59cdf77c0d7beda423fcdd Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:03:00 +0800 +Subject: [PATCH 19/32] bcache: add more error message in + bch_cached_dev_attach() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f0ec25b42603..2300a8dead63 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1170,6 +1170,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + down_write(&dc->writeback_lock); + if (bch_cached_dev_writeback_start(dc)) { + up_write(&dc->writeback_lock); ++ pr_err("Couldn't start writeback facilities for %s", ++ dc->disk.disk->disk_name); + return -ENOMEM; + } + +@@ -1183,6 +1185,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); ++ pr_err("Couldn't run cached device %s", ++ dc->backing_dev_name); + return ret; + } + +-- +2.16.4 + diff --git a/for-next/old3/0020-bcache-improve-error-message-in-bch_cached_dev_run.patch b/for-next/old3/0020-bcache-improve-error-message-in-bch_cached_dev_run.patch new file mode 100644 index 0000000..33fb6ad --- /dev/null +++ b/for-next/old3/0020-bcache-improve-error-message-in-bch_cached_dev_run.patch @@ 
-0,0 +1,49 @@ +From 32a5ee7398cb9d1b9be7fde9ee43e339491b2920 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:09:06 +0800 +Subject: [PATCH 20/32] bcache: improve error message in bch_cached_dev_run() + +should use pr_err(), not pr_debug(), and provide more detailed error +message. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 2300a8dead63..97dbe3151a9c 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -928,13 +928,18 @@ int bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + +- if (dc->io_disable) ++ if (dc->io_disable) { ++ pr_err("I/O disabled on cached dev %s", ++ dc->backing_dev_name); + return -EIO; ++ } + + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); + kfree(buf); ++ pr_info("cached dev %s is running already", ++ dc->backing_dev_name); + return -EBUSY; + } + +@@ -962,7 +967,7 @@ int bch_cached_dev_run(struct cached_dev *dc) + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { +- pr_debug("error creating sysfs link"); ++ pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); + return -ENOMEM; + } + +-- +2.16.4 + diff --git a/for-next/old3/0021-bcache-fix-race-in-btree_flush_write.patch b/for-next/old3/0021-bcache-fix-race-in-btree_flush_write.patch new file mode 100644 index 0000000..7d0367e --- /dev/null +++ b/for-next/old3/0021-bcache-fix-race-in-btree_flush_write.patch @@ -0,0 +1,213 @@ +From b9d3d0b6267b7996038a0ffb9be2e9c7a4115799 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:55:30 +0800 +Subject: [PATCH 21/32] bcache: fix race in btree_flush_write() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/btree.c | 15 +++++++- + 
drivers/md/bcache/btree.h | 2 + + drivers/md/bcache/journal.c | 93 ++++++++++++++++++++++++++++++++++----------- + drivers/md/bcache/journal.h | 4 ++ + 4 files changed, 90 insertions(+), 24 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index cf38a1b031fa..c0dd8fde37af 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -660,6 +660,13 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) + } + + mutex_lock(&b->write_lock); ++ /* don't reap btree node handling in btree_flush_write() */ ++ if (btree_node_journal_flush(b)) { ++ pr_debug("bnode %p is flushing by journal, ignore", b); ++ mutex_unlock(&b->write_lock); ++ goto out_unlock; ++ } ++ + if (btree_node_dirty(b)) + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); +@@ -1071,8 +1078,14 @@ static void btree_node_free(struct btree *b) + + BUG_ON(b == b->c->root); + ++retry: + mutex_lock(&b->write_lock); +- ++ if (btree_node_journal_flush(b)) { ++ mutex_unlock(&b->write_lock); ++ pr_debug("bnode %p journal_flush set, retry", b); ++ schedule_timeout_interruptible(1); ++ goto retry; ++ } + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); +diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h +index d1c72ef64edf..76cfd121a486 100644 +--- a/drivers/md/bcache/btree.h ++++ b/drivers/md/bcache/btree.h +@@ -158,11 +158,13 @@ enum btree_flags { + BTREE_NODE_io_error, + BTREE_NODE_dirty, + BTREE_NODE_write_idx, ++ BTREE_NODE_journal_flush, + }; + + BTREE_FLAG(io_error); + BTREE_FLAG(dirty); + BTREE_FLAG(write_idx); ++BTREE_FLAG(journal_flush); + + static inline struct btree_write *btree_current_write(struct btree *b) + { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 152ec33981be..6dc02f8ae079 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -419,41 +419,87 @@ int 
bch_journal_replay(struct cache_set *s, struct list_head *list) + + static void btree_flush_write(struct cache_set *c) + { +- /* +- * Try to find the btree node with that references the oldest journal +- * entry, best is our current candidate and is locked if non NULL: +- */ +- struct btree *b, *best; +- unsigned i; ++ struct btree *b, *btree_nodes[BTREE_FLUSH_NR]; ++ unsigned i, n; ++ ++ if (c->journal.btree_flushing) ++ return; ++ ++ spin_lock(&c->journal.flush_write_lock); ++ if (c->journal.btree_flushing) { ++ spin_unlock(&c->journal.flush_write_lock); ++ return; ++ } ++ c->journal.btree_flushing = true; ++ spin_unlock(&c->journal.flush_write_lock); + + atomic_long_inc(&c->flush_write); +-retry: +- best = NULL; +- +- for_each_cached_btree(b, c, i) +- if (btree_current_write(b)->journal) { +- if (!best) +- best = b; +- else if (journal_pin_cmp(c, +- btree_current_write(best)->journal, +- btree_current_write(b)->journal)) { +- best = b; +- } ++ memset(btree_nodes, 0, sizeof(btree_nodes)); ++ n = 0; ++ ++ mutex_lock(&c->bucket_lock); ++ list_for_each_entry_reverse(b, &c->btree_cache, list) { ++ if (btree_node_journal_flush(b)) ++ pr_err("BUG: flush_write bit should not be set here!"); ++ ++ mutex_lock(&b->write_lock); ++ ++ if(!btree_node_dirty(b)) { ++ mutex_unlock(&b->write_lock); ++ continue; ++ } ++ ++ if (!btree_current_write(b)->journal) { ++ mutex_unlock(&b->write_lock); ++ continue; ++ } ++ ++ set_btree_node_journal_flush(b); ++ ++ mutex_unlock(&b->write_lock); ++ ++ btree_nodes[n++] = b; ++ if (n == BTREE_FLUSH_NR) ++ break; ++ } ++ mutex_unlock(&c->bucket_lock); ++ ++ for (i = 0; i < n; i++) { ++ b = btree_nodes[i]; ++ if (!b) { ++ pr_err("BUG: btree_nodes[%d] is NULL", i); ++ continue; ++ } ++ ++ /* safe to check without holding b->write_lock */ ++ if (!btree_node_journal_flush(b)) { ++ pr_err("BUG: bnode %p: journal_flush bit cleaned", b); ++ continue; + } + +- b = best; +- if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) 
{ + mutex_unlock(&b->write_lock); +- /* We raced */ +- atomic_long_inc(&c->retry_flush_write); +- goto retry; ++ pr_debug("bnode %p: written by others", b); ++ clear_bit(BTREE_NODE_journal_flush, &b->flags); ++ continue; ++ } ++ ++ if (!btree_node_dirty(b)) { ++ pr_debug("bnode %p: dirty bit cleaned by others", b); ++ clear_bit(BTREE_NODE_journal_flush, &b->flags); ++ mutex_unlock(&b->write_lock); ++ continue; + } + + __bch_btree_node_write(b, NULL); ++ clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + } ++ ++ spin_lock(&c->journal.flush_write_lock); ++ c->journal.btree_flushing = false; ++ spin_unlock(&c->journal.flush_write_lock); + } + + #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) +@@ -875,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c) + struct journal *j = &c->journal; + + spin_lock_init(&j->lock); ++ spin_lock_init(&j->flush_write_lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); + + c->journal_delay_ms = 100; +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index 66f0facff84b..aeed791f05e7 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -103,6 +103,8 @@ struct journal_write { + /* Embedded in struct cache_set */ + struct journal { + spinlock_t lock; ++ spinlock_t flush_write_lock; ++ bool btree_flushing; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; +@@ -154,6 +156,8 @@ struct journal_device { + struct bio_vec bv[8]; + }; + ++#define BTREE_FLUSH_NR 32 ++ + #define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + +-- +2.16.4 + diff --git a/for-next/old3/0022-bcache-remove-retry_flush_write-from-struct-cache_se.patch b/for-next/old3/0022-bcache-remove-retry_flush_write-from-struct-cache_se.patch new file mode 100644 index 0000000..9f06dec --- /dev/null +++ 
b/for-next/old3/0022-bcache-remove-retry_flush_write-from-struct-cache_se.patch @@ -0,0 +1,58 @@ +From 6438edae034748b198ff4b301e5b435113387d01 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 1 Jun 2019 01:58:23 +0800 +Subject: [PATCH 22/32] bcache: remove retry_flush_write from struct cache_set + +retry_flush_write is not used anymore, so remove it. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 - + drivers/md/bcache/sysfs.c | 5 ----- + 2 files changed, 6 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index cb268d7c6cea..35396248a7d5 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -706,7 +706,6 @@ struct cache_set { + + atomic_long_t reclaim; + atomic_long_t flush_write; +- atomic_long_t retry_flush_write; + + enum { + ON_ERROR_UNREGISTER, +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 961a13a223ee..0bfe4e30c501 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -83,7 +83,6 @@ read_attribute(state); + read_attribute(cache_read_races); + read_attribute(reclaim); + read_attribute(flush_write); +-read_attribute(retry_flush_write); + read_attribute(writeback_keys_done); + read_attribute(writeback_keys_failed); + read_attribute(io_errors); +@@ -704,9 +703,6 @@ SHOW(__bch_cache_set) + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + +- sysfs_print(retry_flush_write, +- atomic_long_read(&c->retry_flush_write)); +- + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, +@@ -931,7 +927,6 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_cache_read_races, + &sysfs_reclaim, + &sysfs_flush_write, +- &sysfs_retry_flush_write, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + +-- +2.16.4 + diff --git a/for-next/old3/0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch
b/for-next/old3/0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch new file mode 100644 index 0000000..ecb5500 --- /dev/null +++ b/for-next/old3/0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch @@ -0,0 +1,107 @@ +From fa0d3525fd1572c44f2568513670dc7742c62ccd Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 00:36:18 +0800 +Subject: [PATCH 23/32] bcache: use bcache_mod_wq to replace system wide + system_wq + +This avoids a blocked bcache worker blocking kworkers of other kernel +subsystems (e.g. network). + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 97dbe3151a9c..915ff9365ec6 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -47,6 +47,7 @@ static LIST_HEAD(uncached_devices); + static int bcache_major; + static DEFINE_IDA(bcache_device_idx); + static wait_queue_head_t unregister_wait; ++struct workqueue_struct *bcache_mod_wq; + struct workqueue_struct *bcache_wq; + struct workqueue_struct *bch_journal_wq; + +@@ -1260,7 +1261,7 @@ static void cached_dev_flush(struct closure *cl) + bch_cache_accounting_destroy(&dc->accounting); + kobject_del(&d->kobj); + +- continue_at(cl, cached_dev_free, system_wq); ++ continue_at(cl, cached_dev_free, bcache_mod_wq); + } + + static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) +@@ -1272,7 +1273,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) + __module_get(THIS_MODULE); + INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); +- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); ++ set_closure_fn(&dc->disk.cl, cached_dev_flush, bcache_mod_wq); + kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); + INIT_WORK(&dc->detach, cached_dev_detach_finish); + sema_init(&dc->sb_write_mutex, 1); +@@ -1395,7
+1396,7 @@ static void flash_dev_flush(struct closure *cl) + bcache_device_unlink(d); + mutex_unlock(&bch_register_lock); + kobject_del(&d->kobj); +- continue_at(cl, flash_dev_free, system_wq); ++ continue_at(cl, flash_dev_free, bcache_mod_wq); + } + + static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) +@@ -1406,7 +1407,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) + return -ENOMEM; + + closure_init(&d->cl, NULL); +- set_closure_fn(&d->cl, flash_dev_flush, system_wq); ++ set_closure_fn(&d->cl, flash_dev_flush, bcache_mod_wq); + + kobject_init(&d->kobj, &bch_flash_dev_ktype); + +@@ -1714,7 +1715,7 @@ static void __cache_set_unregister(struct closure *cl) + + mutex_unlock(&bch_register_lock); + +- continue_at(cl, cache_set_flush, system_wq); ++ continue_at(cl, cache_set_flush, bcache_mod_wq); + } + + void bch_cache_set_stop(struct cache_set *c) +@@ -1743,10 +1744,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + + __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); +- set_closure_fn(&c->cl, cache_set_free, system_wq); ++ set_closure_fn(&c->cl, cache_set_free, bcache_mod_wq); + + closure_init(&c->caching, &c->cl); +- set_closure_fn(&c->caching, __cache_set_unregister, system_wq); ++ set_closure_fn(&c->caching, __cache_set_unregister, bcache_mod_wq); + + /* Maybe create continue_at_noreturn() and use it here? 
*/ + closure_set_stopped(&c->cl); +@@ -2583,6 +2584,8 @@ static void bcache_exit(void) + bch_request_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); ++ if (bcache_mod_wq) ++ destroy_workqueue(bcache_mod_wq); + if (bcache_wq) + destroy_workqueue(bcache_wq); + if (bch_journal_wq) +@@ -2642,6 +2645,10 @@ static int __init bcache_init(void) + return bcache_major; + } + ++ bcache_mod_wq = alloc_workqueue("bcache_mod_wq", WQ_MEM_RECLAIM, 0); ++ if (!bcache_mod_wq) ++ goto err; ++ + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + if (!bcache_wq) + goto err; +-- +2.16.4 + diff --git a/for-next/old3/0024-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch b/for-next/old3/0024-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch new file mode 100644 index 0000000..0d6de33 --- /dev/null +++ b/for-next/old3/0024-bcache-add-reclaimed_journal_buckets-to-struct-cache.patch @@ -0,0 +1,73 @@ +From d1be6ec13c2c64ba3834c23f096c7d9adbcdf5f5 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 00:47:23 +0800 +Subject: [PATCH 24/32] bcache: add reclaimed_journal_buckets to struct + cache_set + +An increase-only counter, to count how many buckets are indeed +reclaimed in journal_reclaim(). 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/journal.c | 1 + + drivers/md/bcache/sysfs.c | 5 +++++ + 3 files changed, 7 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 35396248a7d5..013e35a9e317 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -705,6 +705,7 @@ struct cache_set { + atomic_long_t writeback_keys_failed; + + atomic_long_t reclaim; ++ atomic_long_t reclaimed_journal_buckets; + atomic_long_t flush_write; + + enum { +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 6dc02f8ae079..ef4142c623fe 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -614,6 +614,7 @@ static void journal_reclaim(struct cache_set *c) + k->ptr[n++] = MAKE_PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); ++ atomic_long_inc(&c->reclaimed_journal_buckets); + } + + if (n) { +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 0bfe4e30c501..4ab15442cab5 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -82,6 +82,7 @@ read_attribute(bset_tree_stats); + read_attribute(state); + read_attribute(cache_read_races); + read_attribute(reclaim); ++read_attribute(reclaimed_journal_buckets); + read_attribute(flush_write); + read_attribute(writeback_keys_done); + read_attribute(writeback_keys_failed); +@@ -700,6 +701,9 @@ SHOW(__bch_cache_set) + sysfs_print(reclaim, + atomic_long_read(&c->reclaim)); + ++ sysfs_print(reclaimed_journal_buckets, ++ atomic_long_read(&c->reclaimed_journal_buckets)); ++ + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + +@@ -926,6 +930,7 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + &sysfs_reclaim, ++ &sysfs_reclaimed_journal_buckets, + &sysfs_flush_write, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, +-- 
+2.16.4 + diff --git a/for-next/old3/0025-bcache-acquire-bch_register_lock-later-in-cached_dev.patch b/for-next/old3/0025-bcache-acquire-bch_register_lock-later-in-cached_dev.patch new file mode 100644 index 0000000..758c25d --- /dev/null +++ b/for-next/old3/0025-bcache-acquire-bch_register_lock-later-in-cached_dev.patch @@ -0,0 +1,38 @@ +From 2f9473f7715691adc07165ac9bf8394693071fbb Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 01:06:12 +0800 +Subject: [PATCH 25/32] bcache: acquire bch_register_lock later in + cached_dev_detach_finish() + +To avoid a potential deadlock on bch_register_lock when stopping the kthread +or canceling the kworker (which is done synchronously). + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 915ff9365ec6..bf28a51dbdea 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1018,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); + BUG_ON(refcount_read(&dc->count)); + +- mutex_lock(&bch_register_lock); + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); +@@ -1034,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + ++ mutex_lock(&bch_register_lock); ++ + calc_cached_dev_sectors(dc->disk.c); + bcache_device_detach(&dc->disk); + list_move(&dc->list, &uncached_devices); +-- +2.16.4 + diff --git a/for-next/old3/0026-bcache-move-dc-io_disable-into-dc-flags.patch b/for-next/old3/0026-bcache-move-dc-io_disable-into-dc-flags.patch new file mode 100644 index 0000000..705b89a --- /dev/null +++ b/for-next/old3/0026-bcache-move-dc-io_disable-into-dc-flags.patch @@ -0,0 +1,170 @@ +From 3153d5b784eb8a6008cbd7a6087d8eaf1e8f9fe8 Mon Sep 17
00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 01:41:01 +0800 +Subject: [PATCH 26/32] bcache: move dc->io_disable into dc->flags + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 3 ++- + drivers/md/bcache/request.c | 4 ++-- + drivers/md/bcache/super.c | 36 ++++++++++++++++++++++-------------- + drivers/md/bcache/sysfs.c | 9 +++++++-- + 4 files changed, 33 insertions(+), 19 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 013e35a9e317..ccfc3b245462 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -362,7 +362,8 @@ struct cached_dev { + unsigned int sequential_cutoff; + unsigned int readahead; + +- unsigned int io_disable:1; ++#define CACHED_DEV_IO_DISABLED 0 ++ unsigned long flags; + unsigned int verify:1; + unsigned int bypass_torture_test:1; + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 41adcd1546f1..4bdf5be04c0a 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -1175,7 +1175,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + int rw = bio_data_dir(bio); + + if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || +- dc->io_disable)) { ++ test_bit(CACHED_DEV_IO_DISABLED, &dc->flags))) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; +@@ -1236,7 +1236,7 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, + { + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + +- if (dc->io_disable) ++ if (test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) + return -EIO; + + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index bf28a51dbdea..c219a1aeef02 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -888,10 +888,11 @@ static int cached_dev_status_update(void *arg) + + /* + * If this delayed worker 
is stopping outside, directly quit here. +- * dc->io_disable might be set via sysfs interface, so check it +- * here too. ++ * CACHED_DEV_IO_DISABLED might be set via sysfs interface, so check ++ * it here too. + */ +- while (!kthread_should_stop() && !dc->io_disable) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) { + q = bdev_get_queue(dc->bdev); + if (blk_queue_dying(q)) + dc->offline_seconds++; +@@ -904,8 +905,11 @@ static int cached_dev_status_update(void *arg) + BACKING_DEV_OFFLINE_TIMEOUT); + pr_err("%s: disable I/O request due to backing " + "device offline", dc->disk.name); +- dc->io_disable = true; +- /* let others know earlier that io_disable is true */ ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ /* ++ * let others know earlier that CACHED_DEV_IO_DISABLED ++ * is set. ++ */ + smp_mb(); + bcache_device_stop(&dc->disk); + break; +@@ -929,7 +933,7 @@ int bch_cached_dev_run(struct cached_dev *dc) + NULL, + }; + +- if (dc->io_disable) { ++ if (test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)) { + pr_err("I/O disabled on cached dev %s", + dc->backing_dev_name); + return -EIO; +@@ -1305,7 +1309,11 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) + q->backing_dev_info->ra_pages); + + atomic_set(&dc->io_errors, 0); +- dc->io_disable = false; ++ /* ++ * Clear following bit position in dc->flags ++ * - CACHED_DEV_IO_DISABLED ++ */ ++ dc->flags = 0; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; + /* default to auto */ + dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO; +@@ -1480,8 +1488,8 @@ bool bch_cached_dev_error(struct cached_dev *dc) + if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) + return false; + +- dc->io_disable = true; +- /* make others know io_disable is true earlier */ ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ /* make others know CACHED_DEV_IO_DISABLED is set earlier */ + smp_mb(); + + pr_err("stop %s: too many IO errors on backing device %s\n", 
+@@ -1489,7 +1497,7 @@ bool bch_cached_dev_error(struct cached_dev *dc) + + /* + * If the cached device is still attached to a cache set, +- * even dc->io_disable is true and no more I/O requests ++ * even CACHED_DEV_IO_DISABLED is set and no more I/O requests + * accepted, cache device internal I/O (writeback scan or + * garbage collection) may still prevent bcache device from + * being stopped. So here CACHE_SET_IO_DISABLE should be +@@ -1672,11 +1680,11 @@ static void conditional_stop_bcache_device(struct cache_set *c, + * behavior may also introduce potential inconsistence + * data in writeback mode while cache is dirty. + * Therefore before calling bcache_device_stop() due +- * to a broken cache device, dc->io_disable should be +- * explicitly set to true. ++ * to a broken cache device, CACHED_DEV_IO_DISABLED should ++ * be explicitly set. + */ +- dc->io_disable = true; +- /* make others know io_disable is true earlier */ ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ /* make others know CACHED_DEV_IO_DISABLED is set earlier */ + smp_mb(); + bcache_device_stop(d); + } else { +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4ab15442cab5..4bb1592270b1 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -180,7 +180,8 @@ SHOW(__bch_cached_dev) + wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); + sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); + sysfs_printf(io_error_limit, "%i", dc->error_limit); +- sysfs_printf(io_disable, "%i", dc->io_disable); ++ sysfs_printf(io_disable, "%i", ++ (int)test_bit(CACHED_DEV_IO_DISABLED, &dc->flags)); + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_i_term_inverse); + var_print(writeback_rate_p_term_inverse); +@@ -319,7 +320,11 @@ STORE(__cached_dev) + if (attr == &sysfs_io_disable) { + int v = strtoul_or_return(buf); + +- dc->io_disable = v ? 
1 : 0; ++ if (v > 0) ++ set_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ else ++ clear_bit(CACHED_DEV_IO_DISABLED, &dc->flags); ++ return size; + } + + sysfs_strtoul_clamp(sequential_cutoff, +-- +2.16.4 + diff --git a/for-next/old3/0027-bcache-add-CACHED_DEV_FREEING-to-dc-flags.patch b/for-next/old3/0027-bcache-add-CACHED_DEV_FREEING-to-dc-flags.patch new file mode 100644 index 0000000..df965a1 --- /dev/null +++ b/for-next/old3/0027-bcache-add-CACHED_DEV_FREEING-to-dc-flags.patch @@ -0,0 +1,51 @@ +From 036abf0061161f55312f85314308eea5e6e04988 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 03:18:23 +0800 +Subject: [PATCH 27/32] bcache: add CACHED_DEV_FREEING to dc->flags + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/super.c | 7 +++++++ + 2 files changed, 8 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index ccfc3b245462..aae69060db7a 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -363,6 +363,7 @@ struct cached_dev { + unsigned int readahead; + + #define CACHED_DEV_IO_DISABLED 0 ++#define CACHED_DEV_FREEING 1 + unsigned long flags; + unsigned int verify:1; + unsigned int bypass_torture_test:1; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index c219a1aeef02..6cde434edaa9 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1227,6 +1227,12 @@ static void cached_dev_free(struct closure *cl) + { + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + ++ if (test_and_set_bit(CACHED_DEV_FREEING, &dc->flags)) { ++ pr_info("cached device %s is freeing already", ++ dc->backing_dev_name); ++ return; ++ } ++ + mutex_lock(&bch_register_lock); + + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) +@@ -1312,6 +1318,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) + /* + * Clear following bit position in 
dc->flags + * - CACHED_DEV_IO_DISABLED ++ * - CACHED_DEV_FREEING + */ + dc->flags = 0; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; +-- +2.16.4 + diff --git a/for-next/old3/0028-bcache-acquire-bch_register_lock-later-in-cached_dev.patch b/for-next/old3/0028-bcache-acquire-bch_register_lock-later-in-cached_dev.patch new file mode 100644 index 0000000..feee299 --- /dev/null +++ b/for-next/old3/0028-bcache-acquire-bch_register_lock-later-in-cached_dev.patch @@ -0,0 +1,39 @@ +From 4dd4c8a26edc64e9a6b072de3ed0b5ba19d72c3c Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 03:21:16 +0800 +Subject: [PATCH 28/32] bcache: acquire bch_register_lock later in + cached_dev_free() + +to avoid deadlock when stopping workqueue. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 6cde434edaa9..4ebbf6930d31 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1233,8 +1233,6 @@ static void cached_dev_free(struct closure *cl) + return; + } + +- mutex_lock(&bch_register_lock); +- + if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + cancel_writeback_rate_update_dwork(dc); + +@@ -1247,6 +1245,9 @@ static void cached_dev_free(struct closure *cl) + + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); ++ ++ mutex_lock(&bch_register_lock); ++ + bcache_device_free(&dc->disk); + list_del(&dc->list); + +-- +2.16.4 + diff --git a/for-next/old3/0029-bcache-replace-system_wq-to-bcache_mod_wq.patch b/for-next/old3/0029-bcache-replace-system_wq-to-bcache_mod_wq.patch new file mode 100644 index 0000000..4897e85 --- /dev/null +++ b/for-next/old3/0029-bcache-replace-system_wq-to-bcache_mod_wq.patch @@ -0,0 +1,104 @@ +From b51fb8f54a265b7734d916016c20889a92ca0882 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 2 Jun 2019 
18:55:09 +0800 +Subject: [PATCH 29/32] bcache: replace system_wq to bcache_mod_wq + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 3 ++- + drivers/md/bcache/btree.c | 4 ++-- + drivers/md/bcache/journal.c | 2 +- + drivers/md/bcache/sysfs.c | 2 +- + drivers/md/bcache/writeback.c | 4 ++-- + 5 files changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index aae69060db7a..e7f0c42ab234 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -870,10 +870,11 @@ do { \ + for (b = (ca)->buckets + (ca)->sb.first_bucket; \ + b < (ca)->buckets + (ca)->sb.nbuckets; b++) + ++extern struct workqueue_struct *bcache_mod_wq; + static inline void cached_dev_put(struct cached_dev *dc) + { + if (refcount_dec_and_test(&dc->count)) +- schedule_work(&dc->detach); ++ queue_work(bcache_mod_wq, &dc->detach); + } + + static inline bool cached_dev_get(struct cached_dev *dc) +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index c0dd8fde37af..8325a2d11717 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -366,7 +366,7 @@ static void __btree_node_write_done(struct closure *cl) + btree_complete_write(b, w); + + if (btree_node_dirty(b)) +- schedule_delayed_work(&b->work, 30 * HZ); ++ queue_delayed_work(bcache_mod_wq, &b->work, 30 * HZ); + + closure_return_with_destructor(cl, btree_node_write_unlock); + } +@@ -539,7 +539,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) + BUG_ON(!i->keys); + + if (!btree_node_dirty(b)) +- schedule_delayed_work(&b->work, 30 * HZ); ++ queue_delayed_work(bcache_mod_wq, &b->work, 30 * HZ); + + set_btree_node_dirty(b); + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index ef4142c623fe..646e0386de4a 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -887,7 +887,7 @@ atomic_t *bch_journal(struct cache_set *c, + 
journal_try_write(c); + } else if (!w->dirty) { + w->dirty = true; +- schedule_delayed_work(&c->journal.work, ++ queue_delayed_work(bcache_mod_wq, &c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4bb1592270b1..849146d539c9 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -447,7 +447,7 @@ STORE(bch_cached_dev) + + if (attr == &sysfs_writeback_percent) + if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) +- schedule_delayed_work(&dc->writeback_rate_update, ++ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 73f0efac2b9f..54f68ae9d343 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -212,7 +212,7 @@ static void update_writeback_rate(struct work_struct *work) + */ + if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && + !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { +- schedule_delayed_work(&dc->writeback_rate_update, ++ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } + +@@ -835,7 +835,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + dc->writeback_running = true; + + WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); +- schedule_delayed_work(&dc->writeback_rate_update, ++ queue_delayed_work(bcache_mod_wq, &dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + bch_writeback_queue(dc); +-- +2.16.4 + diff --git a/for-next/old3/0030-bcache-shrink-btree-node-cache-after-bch_btree_check.patch b/for-next/old3/0030-bcache-shrink-btree-node-cache-after-bch_btree_check.patch new file mode 100644 index 0000000..bccb7bb --- /dev/null +++ 
b/for-next/old3/0030-bcache-shrink-btree-node-cache-after-bch_btree_check.patch @@ -0,0 +1,45 @@ +From 39e7f0126c8d7c851c275d563cdf5c0d6d40fbe9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 31 May 2019 17:29:56 +0800 +Subject: [PATCH 30/32] bcache: shrink btree node cache after bch_btree_check() + +To release memory proactively for memory allocation in following +routines. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 4ebbf6930d31..1ead10e5253a 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1907,6 +1907,24 @@ static int run_cache_set(struct cache_set *c) + if (bch_btree_check(c)) + goto err; + ++ /* ++ * bch_btree_check() may occupy too much system memory which ++ * will fail memory allocation operations in the following ++ * routines before kernel triggers memory shrinker call backs. ++ * Shrinking 25% mca cache memory proactively here to avoid ++ * potential memory allocation failure. 
++ */ ++ if (!c->shrinker_disabled) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = ++ c->shrink.count_objects(&c->shrink, &sc) / 4; ++ pr_debug("try to shrink %lu (25%%) cached btree node", ++ sc.nr_to_scan); ++ c->shrink.scan_objects(&c->shrink, &sc); ++ } ++ + bch_journal_mark(c, &journal); + bch_initial_gc_finish(c); + pr_debug("btree_check() done"); +-- +2.16.4 + diff --git a/for-next/old3/0031-bcache-fix-potential-deadlock-in-cached_def_free.patch b/for-next/old3/0031-bcache-fix-potential-deadlock-in-cached_def_free.patch new file mode 100644 index 0000000..bceb03c --- /dev/null +++ b/for-next/old3/0031-bcache-fix-potential-deadlock-in-cached_def_free.patch @@ -0,0 +1,134 @@ +From 2bf5d272e3a6fd93802a093229c922c67ff949e9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 4 Jun 2019 14:28:33 +0800 +Subject: [PATCH 31/32] bcache: fix potential deadlock in cached_def_free() + +This patch moves destroying dc->writeback_write_wq from cached_dev_free() +to bch_writeback_thread(). Then a potential deadlock reported by lockdep +engine can be fixed: + +[ 325.736697][ T3394] +====================================================== +[ 325.736700][ T3394] WARNING: possible circular locking dependency +detected +[ 325.927249][ T3394] 5.1.0-lp151.28.4-default+ #5 Tainted: G W +[ 325.927252][ T3394] +------------------------------------------------------ +[ 326.702279][ T3394] kworker/32:8/3394 is trying to acquire lock: +[ 326.702282][ T3394] 0000000006300cd1 +((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x540 +[ 326.868901][ T3394] +[ 326.868901][ T3394] but task is already holding lock: +[ 326.868904][ T3394] 00000000efbccbbd +((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x1b5/0x5e0 +[ 327.065686][ T3394] +[ 327.065686][ T3394] which lock already depends on the new lock. 
+[ 327.065686][ T3394] +[ 327.065688][ T3394] +[ 327.065688][ T3394] the existing dependency chain (in reverse order) is: +[ 327.319678][ T3394] +[ 327.319678][ T3394] -> #1 ((work_completion)(&cl->work)#2){+.+.}: +[ 327.492545][ T3394] process_one_work+0x20e/0x5e0 +[ 327.492547][ T3394] worker_thread+0x3c/0x390 +[ 327.672704][ T3394] kthread+0x125/0x140 +[ 327.672708][ T3394] ret_from_fork+0x3a/0x50 +[ 327.901739][ T3394] +[ 327.901739][ T3394] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: +[ 328.233746][ T3394] lock_acquire+0xb4/0x1c0 +[ 328.233749][ T3394] flush_workqueue+0xaa/0x540 +[ 328.367099][ T3394] drain_workqueue+0xa9/0x180 +[ 328.367102][ T3394] destroy_workqueue+0x17/0x250 +[ 328.519179][ T3394] cached_dev_free+0x5b/0x200 [bcache] +[ 328.519182][ T3394] process_one_work+0x23b/0x5e0 +[ 328.666057][ T3394] worker_thread+0x3c/0x390 +[ 328.666060][ T3394] kthread+0x125/0x140 +[ 328.817088][ T3394] ret_from_fork+0x3a/0x50 +[ 328.817091][ T3394] +[ 328.817091][ T3394] other info that might help us debug this: +[ 328.817091][ T3394] +[ 329.013887][ T3394] Possible unsafe locking scenario: +[ 329.013887][ T3394] +[ 329.013889][ T3394] CPU0 CPU1 +[ 329.164918][ T3394] ---- ---- +[ 329.164920][ T3394] lock((work_completion)(&cl->work)#2); +[ 329.330509][ T3394] +lock((wq_completion)bcache_writeback_wq); +[ 329.453461][ T3394] +lock((work_completion)(&cl->work)#2); +[ 329.600333][ T3394] lock((wq_completion)bcache_writeback_wq); +[ 329.756566][ T3394] +[ 329.756566][ T3394] *** DEADLOCK *** +[ 329.756566][ T3394] +[ 329.756568][ T3394] 2 locks held by kworker/32:8/3394: +[ 329.908637][ T3394] #0: 00000000a9fd42af +((wq_completion)bcache_mod_wq){+.+.}, at: process_one_work+0x1b5/0x5e0 +[ 330.055510][ T3394] #1: 00000000efbccbbd +((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x1b5/0x5e0 +[ 330.153501][ T3394] +[ 330.153501][ T3394] stack backtrace: +[ 330.153504][ T3394] CPU: 32 PID: 3394 Comm: kworker/32:8 Tainted: G + W 
5.1.0-lp151.28.4-default+ #5 +[ 330.311808][ T3394] Hardware name: Lenovo ThinkSystem SR650 +-[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 +[ 330.311815][ T3394] Workqueue: bcache_mod_wq cached_dev_free [bcache] +[ 330.431640][ T3394] Call Trace: +[ 330.431644][ T3394] dump_stack+0x85/0xcb +[ 330.562909][ T3394] print_circular_bug+0x19a/0x1f0 +[ 330.692101][ T3394] __lock_acquire+0x177e/0x17d0 +[ 330.844173][ T3394] ? __lock_acquire+0x69a/0x17d0 +[ 331.009763][ T3394] ? lock_acquire+0xb4/0x1c0 +[ 331.139994][ T3394] ? find_held_lock+0x34/0xa0 +[ 331.139997][ T3394] lock_acquire+0xb4/0x1c0 +[ 331.293107][ T3394] ? flush_workqueue+0x87/0x540 +[ 331.411893][ T3394] flush_workqueue+0xaa/0x540 +[ 331.596203][ T3394] ? flush_workqueue+0x87/0x540 +[ 331.596206][ T3394] ? sched_clock+0x5/0x10 +[ 331.768030][ T3394] ? drain_workqueue+0xa9/0x180 +[ 331.959621][ T3394] drain_workqueue+0xa9/0x180 +[ 331.959624][ T3394] destroy_workqueue+0x17/0x250 +[ 332.157456][ T3394] cached_dev_free+0x5b/0x200 [bcache] +[ 332.296003][ T3394] process_one_work+0x23b/0x5e0 +[ 332.381511][ T3394] worker_thread+0x3c/0x390 +[ 332.494057][ T3394] ? process_one_work+0x5e0/0x5e0 +[ 332.604523][ T3394] kthread+0x125/0x140 +[ 332.604526][ T3394] ? 
kthread_create_worker_on_cpu+0x70/0x70 +[ 332.709790][ T3394] ret_from_fork+0x3a/0x50 +[ 337.090520][ T3394] bcache: cached_dev_free() + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 2 -- + drivers/md/bcache/writeback.c | 4 ++++ + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 1ead10e5253a..9539a8cd67f5 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1238,8 +1238,6 @@ static void cached_dev_free(struct closure *cl) + + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + kthread_stop(dc->writeback_thread); +- if (dc->writeback_write_wq) +- destroy_workqueue(dc->writeback_write_wq); + if (!IS_ERR_OR_NULL(dc->status_update_thread)) + kthread_stop(dc->status_update_thread); + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 54f68ae9d343..7536fac249b7 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -735,6 +735,10 @@ static int bch_writeback_thread(void *arg) + } + } + ++ if (dc->writeback_write_wq) { ++ flush_workqueue(dc->writeback_write_wq); ++ destroy_workqueue(dc->writeback_write_wq); ++ } + cached_dev_put(dc); + wait_for_kthread_stop(); + +-- +2.16.4 + diff --git a/for-next/old3/0032-bcache-fix-return-value-error-in-bch_journal_read.patch b/for-next/old3/0032-bcache-fix-return-value-error-in-bch_journal_read.patch new file mode 100644 index 0000000..f2de204 --- /dev/null +++ b/for-next/old3/0032-bcache-fix-return-value-error-in-bch_journal_read.patch @@ -0,0 +1,42 @@ +From 095ebbae1dbccae3c04ac4432e8dc550568ab11e Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 4 Jun 2019 14:43:08 +0800 +Subject: [PATCH 32/32] bcache: fix return value error in bch_journal_read() + +When everything is OK in bch_journal_read(), finally the return value +is returned by, + return ret; +which assumes ret will be 0 here. 
This assumption is wrong when all +journal buckets are full and filled with valid journal entries. In +such case the last location that references read_bucket() sets 'ret' to +1, which means a new jset was added into the jset list. The jset list is list +'journal' in caller run_cache_set(). + +Return 1 to run_cache_set() means something wrong and the cache set +won't start, but indeed everything is OK. + +This patch changes the line at end of bch_journal_read() to directly +return 0 since everything is good. Then a bogus error +is fixed. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/journal.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 646e0386de4a..f5730dd1d3c2 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -293,7 +293,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + struct journal_replay, + list)->j.seq; + +- return ret; ++ return 0; + #undef read_bucket + } + +-- +2.16.4 + diff --git a/for-next/old3/list b/for-next/old3/list new file mode 100644 index 0000000..b93bd26 --- /dev/null +++ b/for-next/old3/list @@ -0,0 +1,3 @@ +0023-bcache-use-bcache_mod_wq-to-replace-system-wide-syst.patch +0026-bcache-move-dc-io_disable-into-dc-flags.patch +0029-bcache-replace-system_wq-to-bcache_mod_wq.patch |