aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorColy Li <colyli@suse.de>2019-11-13 12:01:56 +0800
committerColy Li <colyli@suse.de>2019-11-13 12:01:56 +0800
commit04b62bc7b6b66568c57a2e487f456b5fb3168540 (patch)
treefee7115594e5b162ede3de5938697211d91196ac
parentc435d8a502aafa090103345ecc4a94228c4d1152 (diff)
downloadbcache-patches-04b62bc7b6b66568c57a2e487f456b5fb3168540.tar.gz
for-next: patches for 5.5
-rw-r--r--for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch29
-rw-r--r--for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch81
-rw-r--r--for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch35
-rw-r--r--for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch92
-rw-r--r--for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch94
-rw-r--r--for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch39
-rw-r--r--for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch41
-rw-r--r--for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch38
-rw-r--r--for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch150
-rw-r--r--for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch34
-rw-r--r--for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch105
-rw-r--r--for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch46
-rw-r--r--for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch1
-rw-r--r--for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch28
14 files changed, 721 insertions, 92 deletions
diff --git a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch b/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch
deleted file mode 100644
index edd8fb6..0000000
--- a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-From: Shile Zhang <shile.zhang@linux.alibaba.com>
-Date: Thu, 15 Aug 2019 00:51:51 +0800
-Subject: [PATCH] bcache: add cond_resched() in __bch_cache_cmp()
-
-Read /sys/fs/bcache/<uuid>/cacheN/priority_stats can take very long
-time with huge cache after long run.
-
-Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
-Tested-by: Heitor Alves de Siqueira <halves@canonical.com>
-Signed-off-by: Coly Li <colyli@suse.de>
----
- drivers/md/bcache/sysfs.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
-index 9f0826712845..6b29e34acf7a 100644
---- a/drivers/md/bcache/sysfs.c
-+++ b/drivers/md/bcache/sysfs.c
-@@ -960,6 +960,7 @@ KTYPE(bch_cache_set_internal);
-
- static int __bch_cache_cmp(const void *l, const void *r)
- {
-+ cond_resched();
- return *((uint16_t *)r) - *((uint16_t *)l);
- }
-
---
-2.16.4
-
diff --git a/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch b/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch
new file mode 100644
index 0000000..259d0df
--- /dev/null
+++ b/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch
@@ -0,0 +1,81 @@
+From 77cedea4557df9dca93d6bfb33854a688e5cbce6 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 25 Sep 2019 22:16:33 +0800
+Subject: [PATCH 01/10] bcache: fix fifo index swapping condition in
+ journal_pin_cmp()
+
+Fifo structure journal.pin is implemented by a cycle buffer, if the back
+index reaches highest location of the cycle buffer, it will be swapped
+to 0. Once the swapping happens, it means a smaller fifo index might be
+associated to a newer journal entry. So the btree node with oldest
+journal entry won't be selected in bch_btree_leaf_dirty() to reference
+the dirty B+tree leaf node. This problem may cause bcache journal won't
+protect unflushed oldest B+tree dirty leaf node in power failure, and
+this B+tree leaf node is possible to beinconsistent after reboot from
+power failure.
+
+This patch fixes the fifo index comparing logic in journal_pin_cmp(),
+to avoid potential corrupted B+tree leaf node when the back index of
+journal pin is swapped.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/btree.c | 26 ++++++++++++++++++++++++++
+ drivers/md/bcache/journal.h | 4 ----
+ 2 files changed, 26 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index ba434d9ac720..00523cd1db80 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -528,6 +528,32 @@ static void btree_node_write_work(struct work_struct *w)
+ mutex_unlock(&b->write_lock);
+ }
+
++/* return true if journal pin 'l' is newer than 'r' */
++static bool journal_pin_cmp(struct cache_set *c,
++ atomic_t *l,
++ atomic_t *r)
++{
++ int l_idx, r_idx, f_idx, b_idx;
++ bool ret = false;
++
++ l_idx = fifo_idx(&(c)->journal.pin, (l));
++ r_idx = fifo_idx(&(c)->journal.pin, (r));
++ f_idx = (c)->journal.pin.front;
++ b_idx = (c)->journal.pin.back;
++
++ if (l_idx > r_idx)
++ ret = true;
++ /* in case fifo back pointer is swapped */
++ if (b_idx < f_idx) {
++ if (l_idx <= b_idx && r_idx >= f_idx)
++ ret = true;
++ else if (l_idx >= f_idx && r_idx <= b_idx)
++ ret = false;
++ }
++
++ return ret;
++}
++
+ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
+ {
+ struct bset *i = btree_bset_last(b);
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..06b3eaab7d16 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -157,10 +157,6 @@ struct journal_device {
+ };
+
+ #define BTREE_FLUSH_NR 8
+-
+-#define journal_pin_cmp(c, l, r) \
+- (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
+-
+ #define JOURNAL_PIN 20000
+
+ #define journal_full(j) \
+--
+2.16.4
+
diff --git a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch b/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch
deleted file mode 100644
index 44096e4..0000000
--- a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From 3c3c34a87be58548a302573dbe32b518f047db09 Mon Sep 17 00:00:00 2001
-From: Kent Overstreet <kent.overstreet@gmail.com>
-Date: Mon, 10 Jun 2019 15:14:20 -0400
-Subject: [PATCH] closures: fix a race on wakeup from closure_sync
-
-Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-Acked-by: Coly Li <colyli@suse.de>
----
- drivers/md/bcache/closure.c | 10 ++++++++--
- 1 file changed, 8 insertions(+), 2 deletions(-)
-
-diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
-index 73f5319295bc..c12cd809ab19 100644
---- a/drivers/md/bcache/closure.c
-+++ b/drivers/md/bcache/closure.c
-@@ -105,8 +105,14 @@ struct closure_syncer {
-
- static void closure_sync_fn(struct closure *cl)
- {
-- cl->s->done = 1;
-- wake_up_process(cl->s->task);
-+ struct closure_syncer *s = cl->s;
-+ struct task_struct *p;
-+
-+ rcu_read_lock();
-+ p = READ_ONCE(s->task);
-+ s->done = 1;
-+ wake_up_process(p);
-+ rcu_read_unlock();
- }
-
- void __sched __closure_sync(struct closure *cl)
---
-2.16.4
-
diff --git a/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch b/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch
new file mode 100644
index 0000000..ae29ce1
--- /dev/null
+++ b/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch
@@ -0,0 +1,92 @@
+From b2641da321dd27c968515988b97e1cc20be6c937 Mon Sep 17 00:00:00 2001
+From: Guoju Fang <fangguoju@gmail.com>
+Date: Tue, 20 Aug 2019 06:13:55 -0400
+Subject: [PATCH 02/10] bcache: fix a lost wake-up problem caused by
+ mca_cannibalize_lock
+
+This patch fix a lost wake-up problem caused by the race between
+mca_cannibalize_lock and bch_cannibalize_unlock.
+
+Consider two processes, A and B. Process A is executing
+mca_cannibalize_lock, while process B takes c->btree_cache_alloc_lock
+and is executing bch_cannibalize_unlock. The problem happens that after
+process A executes cmpxchg and will execute prepare_to_wait. In this
+timeslice process B executes wake_up, but after that process A executes
+prepare_to_wait and set the state to TASK_INTERRUPTIBLE. Then process A
+goes to sleep but no one will wake up it. This problem may cause bcache
+device to dead.
+
+Signed-off-by: Guoju Fang <fangguoju@gmail.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bcache.h | 1 +
+ drivers/md/bcache/btree.c | 12 ++++++++----
+ drivers/md/bcache/super.c | 1 +
+ 3 files changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 013e35a9e317..3653faf3bf48 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -582,6 +582,7 @@ struct cache_set {
+ */
+ wait_queue_head_t btree_cache_wait;
+ struct task_struct *btree_cache_alloc_lock;
++ spinlock_t btree_cannibalize_lock;
+
+ /*
+ * When we free a btree node, we increment the gen of the bucket the
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 00523cd1db80..39d7fc1ef1ee 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -910,15 +910,17 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k)
+
+ static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+ {
+- struct task_struct *old;
+-
+- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+- if (old && old != current) {
++ spin_lock(&c->btree_cannibalize_lock);
++ if (likely(c->btree_cache_alloc_lock == NULL)) {
++ c->btree_cache_alloc_lock = current;
++ } else if (c->btree_cache_alloc_lock != current) {
+ if (op)
+ prepare_to_wait(&c->btree_cache_wait, &op->wait,
+ TASK_UNINTERRUPTIBLE);
++ spin_unlock(&c->btree_cannibalize_lock);
+ return -EINTR;
+ }
++ spin_unlock(&c->btree_cannibalize_lock);
+
+ return 0;
+ }
+@@ -953,10 +955,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+ */
+ static void bch_cannibalize_unlock(struct cache_set *c)
+ {
++ spin_lock(&c->btree_cannibalize_lock);
+ if (c->btree_cache_alloc_lock == current) {
+ c->btree_cache_alloc_lock = NULL;
+ wake_up(&c->btree_cache_wait);
+ }
++ spin_unlock(&c->btree_cannibalize_lock);
+ }
+
+ static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 20ed838e9413..ebb854ed05a4 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1769,6 +1769,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+ sema_init(&c->sb_write_mutex, 1);
+ mutex_init(&c->bucket_lock);
+ init_waitqueue_head(&c->btree_cache_wait);
++ spin_lock_init(&c->btree_cannibalize_lock);
+ init_waitqueue_head(&c->bucket_wait);
+ init_waitqueue_head(&c->gc_wait);
+ sema_init(&c->uuid_write_mutex, 1);
+--
+2.16.4
+
diff --git a/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch b/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch
new file mode 100644
index 0000000..d4faa45
--- /dev/null
+++ b/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch
@@ -0,0 +1,94 @@
+From 995330a14b286eed7407f8840e916720c31e440f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 28 Sep 2019 14:21:23 +0800
+Subject: [PATCH 03/10] bcache: fix static checker warning in
+ bcache_device_free()
+
+Commit cafe56359144 ("bcache: A block layer cache") leads to the
+following static checker warning:
+
+ ./drivers/md/bcache/super.c:770 bcache_device_free()
+ warn: variable dereferenced before check 'd->disk' (see line 766)
+
+drivers/md/bcache/super.c
+ 762 static void bcache_device_free(struct bcache_device *d)
+ 763 {
+ 764 lockdep_assert_held(&bch_register_lock);
+ 765
+ 766 pr_info("%s stopped", d->disk->disk_name);
+ ^^^^^^^^^
+Unchecked dereference.
+
+ 767
+ 768 if (d->c)
+ 769 bcache_device_detach(d);
+ 770 if (d->disk && d->disk->flags & GENHD_FL_UP)
+ ^^^^^^^
+Check too late.
+
+ 771 del_gendisk(d->disk);
+ 772 if (d->disk && d->disk->queue)
+ 773 blk_cleanup_queue(d->disk->queue);
+ 774 if (d->disk) {
+ 775 ida_simple_remove(&bcache_device_idx,
+ 776 first_minor_to_idx(d->disk->first_minor));
+ 777 put_disk(d->disk);
+ 778 }
+ 779
+
+It is not 100% sure that the gendisk struct of bcache device will always
+be there, the warning makes sense when there is problem in block core.
+
+This patch tries to remove the static checking warning by checking
+d->disk to avoid NULL pointer deferences.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index ebb854ed05a4..7beccede5360 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -761,20 +761,28 @@ static inline int idx_to_first_minor(int idx)
+
+ static void bcache_device_free(struct bcache_device *d)
+ {
++ struct gendisk *disk = d->disk;
++
+ lockdep_assert_held(&bch_register_lock);
+
+- pr_info("%s stopped", d->disk->disk_name);
++ if (disk)
++ pr_info("%s stopped", disk->disk_name);
++ else
++ pr_err("bcache device (NULL gendisk) stopped");
+
+ if (d->c)
+ bcache_device_detach(d);
+- if (d->disk && d->disk->flags & GENHD_FL_UP)
+- del_gendisk(d->disk);
+- if (d->disk && d->disk->queue)
+- blk_cleanup_queue(d->disk->queue);
+- if (d->disk) {
++
++ if (disk) {
++ if (disk->flags & GENHD_FL_UP)
++ del_gendisk(disk);
++
++ if (disk->queue)
++ blk_cleanup_queue(disk->queue);
++
+ ida_simple_remove(&bcache_device_idx,
+- first_minor_to_idx(d->disk->first_minor));
+- put_disk(d->disk);
++ first_minor_to_idx(disk->first_minor));
++ put_disk(disk);
+ }
+
+ bioset_exit(&d->bio_split);
+--
+2.16.4
+
diff --git a/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch b/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch
new file mode 100644
index 0000000..306712d
--- /dev/null
+++ b/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch
@@ -0,0 +1,39 @@
+From fbe2ab194731b8ef42b3cc94c649b8db8c8e7440 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 30 Sep 2019 15:30:44 +0800
+Subject: [PATCH 04/10] bcache: add more accurate error messages in
+ read_super()
+
+Previous code only returns "Not a bcache superblock" for both bcache
+super block offset and magic error. This patch addss more accurate error
+messages,
+- for super block unmatched offset:
+ "Not a bcache superblock (bad offset)"
+- for super block unmatched magic number:
+ "Not a bcache superblock (bad magic)"
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 7beccede5360..623fdaf10c4c 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+ pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ sb->version, sb->flags, sb->seq, sb->keys);
+
+- err = "Not a bcache superblock";
++ err = "Not a bcache superblock (bad offset)";
+ if (sb->offset != SB_SECTOR)
+ goto err;
+
++ err = "Not a bcache superblock (bad magic)";
+ if (memcmp(sb->magic, bcache_magic, 16))
+ goto err;
+
+--
+2.16.4
+
diff --git a/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch b/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch
new file mode 100644
index 0000000..016a694
--- /dev/null
+++ b/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch
@@ -0,0 +1,41 @@
+From 8dbad7b7f9bcbfd6ea04e640378b8f3f49f9f275 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 8 Oct 2019 21:37:01 +0800
+Subject: [PATCH 05/10] bcache: deleted code comments for dead code in
+ bch_data_insert_keys()
+
+In request.c:bch_data_insert_keys(), there is code comment for a piece
+of dead code. This patch deletes the dead code and its code comment
+since they are useless in practice.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/request.c | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index 41adcd1546f1..73478a91a342 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl)
+ struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+ int ret;
+
+- /*
+- * If we're looping, might already be waiting on
+- * another journal write - can't wait on more than one journal write at
+- * a time
+- *
+- * XXX: this looks wrong
+- */
+-#if 0
+- while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
+- closure_sync(&s->cl);
+-#endif
+-
+ if (!op->replace)
+ journal_ref = bch_journal(op->c, &op->insert_keys,
+ op->flush_journal ? cl : NULL);
+--
+2.16.4
+
diff --git a/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch b/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch
new file mode 100644
index 0000000..97e1c24
--- /dev/null
+++ b/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch
@@ -0,0 +1,38 @@
+From 7e659fd9ed2facc6d6782d2741c3f04293f9bf7a Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 10 Oct 2019 14:19:08 +0800
+Subject: [PATCH 06/10] bcache: add code comment bch_keylist_pop() and
+ bch_keylist_pop_front()
+
+This patch adds simple code comments for bch_keylist_pop() and
+bch_keylist_pop_front() in bset.c, to make the code more easier to
+be understand.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bset.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
+index 08768796b543..f37a429f093d 100644
+--- a/drivers/md/bcache/bset.c
++++ b/drivers/md/bcache/bset.c
+@@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s)
+ return 0;
+ }
+
++/* Pop the top key of keylist by pointing l->top to its previous key */
+ struct bkey *bch_keylist_pop(struct keylist *l)
+ {
+ struct bkey *k = l->keys;
+@@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
+ return l->top = k;
+ }
+
++/* Pop the bottom key of keylist and update l->top_p */
+ void bch_keylist_pop_front(struct keylist *l)
+ {
+ l->top_p -= bkey_u64s(l->keys);
+--
+2.16.4
+
diff --git a/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch b/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch
new file mode 100644
index 0000000..2e7fccc
--- /dev/null
+++ b/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch
@@ -0,0 +1,150 @@
+From 6932d005723a7a9d2b268c6c9065b2f6d47850cf Mon Sep 17 00:00:00 2001
+From: Andrea Righi <andrea.righi@canonical.com>
+Date: Wed, 7 Aug 2019 12:38:06 +0200
+Subject: [PATCH 07/10] bcache: fix deadlock in bcache_allocator
+
+bcache_allocator can call the following:
+
+ bch_allocator_thread()
+ -> bch_prio_write()
+ -> bch_bucket_alloc()
+ -> wait on &ca->set->bucket_wait
+
+But the wake up event on bucket_wait is supposed to come from
+bch_allocator_thread() itself => deadlock:
+
+[ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds.
+[ 1158.495929] Not tainted 5.3.0-050300rc3-generic #201908042232
+[ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[ 1158.504413] bcache_allocato D 0 15861 2 0x80004000
+[ 1158.504419] Call Trace:
+[ 1158.504429] __schedule+0x2a8/0x670
+[ 1158.504432] schedule+0x2d/0x90
+[ 1158.504448] bch_bucket_alloc+0xe5/0x370 [bcache]
+[ 1158.504453] ? wait_woken+0x80/0x80
+[ 1158.504466] bch_prio_write+0x1dc/0x390 [bcache]
+[ 1158.504476] bch_allocator_thread+0x233/0x490 [bcache]
+[ 1158.504491] kthread+0x121/0x140
+[ 1158.504503] ? invalidate_buckets+0x890/0x890 [bcache]
+[ 1158.504506] ? kthread_park+0xb0/0xb0
+[ 1158.504510] ret_from_fork+0x35/0x40
+
+Fix by making the call to bch_prio_write() non-blocking, so that
+bch_allocator_thread() never waits on itself.
+
+Moreover, make sure to wake up the garbage collector thread when
+bch_prio_write() is failing to allocate buckets.
+
+BugLink: https://bugs.launchpad.net/bugs/1784665
+BugLink: https://bugs.launchpad.net/bugs/1796292
+Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/alloc.c | 5 ++++-
+ drivers/md/bcache/bcache.h | 2 +-
+ drivers/md/bcache/super.c | 27 +++++++++++++++++++++------
+ 3 files changed, 26 insertions(+), 8 deletions(-)
+
+diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
+index 6f776823b9ba..a1df0d95151c 100644
+--- a/drivers/md/bcache/alloc.c
++++ b/drivers/md/bcache/alloc.c
+@@ -377,7 +377,10 @@ static int bch_allocator_thread(void *arg)
+ if (!fifo_full(&ca->free_inc))
+ goto retry_invalidate;
+
+- bch_prio_write(ca);
++ if (bch_prio_write(ca, false) < 0) {
++ ca->invalidate_needs_gc = 1;
++ wake_up_gc(ca->set);
++ }
+ }
+ }
+ out:
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 3653faf3bf48..50241e045c70 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -978,7 +978,7 @@ bool bch_cached_dev_error(struct cached_dev *dc);
+ __printf(2, 3)
+ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
+
+-void bch_prio_write(struct cache *ca);
++int bch_prio_write(struct cache *ca, bool wait);
+ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
+
+ extern struct workqueue_struct *bcache_wq;
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 623fdaf10c4c..d1352fcc6ff2 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -530,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
+ closure_sync(cl);
+ }
+
+-void bch_prio_write(struct cache *ca)
++int bch_prio_write(struct cache *ca, bool wait)
+ {
+ int i;
+ struct bucket *b;
+ struct closure cl;
+
++ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
++ fifo_used(&ca->free[RESERVE_PRIO]),
++ fifo_used(&ca->free[RESERVE_NONE]),
++ fifo_used(&ca->free_inc));
++
++ /*
++ * Pre-check if there are enough free buckets. In the non-blocking
++ * scenario it's better to fail early rather than starting to allocate
++ * buckets and do a cleanup later in case of failure.
++ */
++ if (!wait) {
++ size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
++ fifo_used(&ca->free[RESERVE_NONE]);
++ if (prio_buckets(ca) > avail)
++ return -ENOMEM;
++ }
++
+ closure_init_stack(&cl);
+
+ lockdep_assert_held(&ca->set->bucket_lock);
+@@ -545,9 +562,6 @@ void bch_prio_write(struct cache *ca)
+ atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
+ &ca->meta_sectors_written);
+
+- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+- // fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+-
+ for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+ long bucket;
+ struct prio_set *p = ca->disk_buckets;
+@@ -565,7 +579,7 @@ void bch_prio_write(struct cache *ca)
+ p->magic = pset_magic(&ca->sb);
+ p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+
+- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
++ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
+ BUG_ON(bucket == -1);
+
+ mutex_unlock(&ca->set->bucket_lock);
+@@ -594,6 +608,7 @@ void bch_prio_write(struct cache *ca)
+
+ ca->prio_last_buckets[i] = ca->prio_buckets[i];
+ }
++ return 0;
+ }
+
+ static void prio_read(struct cache *ca, uint64_t bucket)
+@@ -1964,7 +1979,7 @@ static int run_cache_set(struct cache_set *c)
+
+ mutex_lock(&c->bucket_lock);
+ for_each_cache(ca, c, i)
+- bch_prio_write(ca);
++ bch_prio_write(ca, true);
+ mutex_unlock(&c->bucket_lock);
+
+ err = "cannot allocate new UUID bucket";
+--
+2.16.4
+
diff --git a/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch b/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch
new file mode 100644
index 0000000..0296a83
--- /dev/null
+++ b/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch
@@ -0,0 +1,34 @@
+From 239fc6405b9264ac61ded073174cee1cbff16a9f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 12 Nov 2019 17:03:18 +0800
+Subject: [PATCH 08/10] bcache: add code comments in bch_btree_leaf_dirty()
+
+This patch adds code comments in bch_btree_leaf_dirty() to explain
+why w->journal should always reference the eldest journal pin of
+all the writing bkeys in the btree node. To make the bcache journal
+code to be easier to be understood.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/btree.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 39d7fc1ef1ee..48e33ee0d876 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -569,6 +569,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
+
+ set_btree_node_dirty(b);
+
++ /*
++ * w->journal is always the oldest journal pin of all bkeys
++ * in the leaf node, to make sure the oldest jset seq won't
++ * be increased before this btree node is flushed.
++ */
+ if (journal_ref) {
+ if (w->journal &&
+ journal_pin_cmp(b->c, w->journal, journal_ref)) {
+--
+2.16.4
+
diff --git a/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch b/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch
new file mode 100644
index 0000000..c8b3194
--- /dev/null
+++ b/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch
@@ -0,0 +1,105 @@
+From 1a03ba923ac3dcf83518ab5f876df2ebf6e0e8a7 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 12 Nov 2019 18:24:36 +0800
+Subject: [PATCH 09/10] bcache: add idle_max_writeback_rate sysfs interface
+
+For writeback mode, if there is no regular I/O request for a while,
+the writeback rate will be set to the maximum value (1TB/s for now).
+This is good for most of the storage workload, but there are still
+people don't what the maximum writeback rate in I/O idle time.
+
+This patch adds a sysfs interface file idle_max_writeback_rate to
+permit people to disable maximum writeback rate. Then the minimum
+writeback rate can be advised by writeback_rate_minimum in the
+bcache device's sysfs interface.
+
+Reported-by: Christian Balzer <chibi@gol.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bcache.h | 1 +
+ drivers/md/bcache/super.c | 1 +
+ drivers/md/bcache/sysfs.c | 7 +++++++
+ drivers/md/bcache/writeback.c | 4 ++++
+ 4 files changed, 13 insertions(+)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 50241e045c70..9198c1b480d9 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -724,6 +724,7 @@ struct cache_set {
+ unsigned int gc_always_rewrite:1;
+ unsigned int shrinker_disabled:1;
+ unsigned int copy_gc_enabled:1;
++ unsigned int idle_max_writeback_rate_enabled:1;
+
+ #define BUCKET_HASH_BITS 12
+ struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS];
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index d1352fcc6ff2..77e9869345e7 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1834,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+ c->congested_read_threshold_us = 2000;
+ c->congested_write_threshold_us = 20000;
+ c->error_limit = DEFAULT_IO_ERROR_LIMIT;
++ c->idle_max_writeback_rate_enabled = 1;
+ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+ return c;
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index 627dcea0f5b6..733e2ddf3c78 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks);
+ rw_attribute(cache_replacement_policy);
+ rw_attribute(btree_shrinker_disabled);
+ rw_attribute(copy_gc_enabled);
++rw_attribute(idle_max_writeback_rate);
+ rw_attribute(gc_after_writeback);
+ rw_attribute(size);
+
+@@ -747,6 +748,8 @@ SHOW(__bch_cache_set)
+ sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
+ sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
++ sysfs_printf(idle_max_writeback_rate, "%i",
++ c->idle_max_writeback_rate_enabled);
+ sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
+ sysfs_printf(io_disable, "%i",
+ test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+@@ -864,6 +867,9 @@ STORE(__bch_cache_set)
+ sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite);
+ sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
+ sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled);
++ sysfs_strtoul_bool(idle_max_writeback_rate,
++ c->idle_max_writeback_rate_enabled);
++
+ /*
+ * write gc_after_writeback here may overwrite an already set
+ * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
+@@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
+ &sysfs_gc_always_rewrite,
+ &sysfs_btree_shrinker_disabled,
+ &sysfs_copy_gc_enabled,
++ &sysfs_idle_max_writeback_rate,
+ &sysfs_gc_after_writeback,
+ &sysfs_io_disable,
+ &sysfs_cutoff_writeback,
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index d60268fe49e1..4a40f9eadeaf 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc)
+ static bool set_at_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *dc)
+ {
++ /* Don't sst max writeback rate if it is disabled */
++ if (!c->idle_max_writeback_rate_enabled)
++ return false;
++
+ /* Don't set max writeback rate if gc is running */
+ if (!c->gc_mark_valid)
+ return false;
+--
+2.16.4
+
diff --git a/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch b/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch
new file mode 100644
index 0000000..bd2e023
--- /dev/null
+++ b/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch
@@ -0,0 +1,46 @@
+From 8cfdd5280f61070b29f0b4b711bf090397893dd0 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 12 Nov 2019 23:41:03 +0800
+Subject: [PATCH 10/10] bcache: at least try to shrink 1 node in bch_mca_scan()
+
+In bch_mca_scan(), the number of shrinking btree node is calculated
+by code like this,
+ unsigned long nr = sc->nr_to_scan;
+
+ nr /= c->btree_pages;
+ nr = min_t(unsigned long, nr, mca_can_free(c));
+variable sc->nr_to_scan is number of objects (here is bcache B+tree
+nodes' number) to shrink, and pointer variable sc is sent from memory
+management code as parametr of a callback.
+
+If sc->nr_to_scan is smaller than c->btree_pages, after the above
+calculation, variable 'nr' will be 0 and nothing will be shrunk. It is
+frequeently observed that only 1 or 2 is set to sc->nr_to_scan and make
+nr to be zero. Then bch_mca_scan() will do nothing more then acquiring
+and releasing mutex c->bucket_lock.
+
+This patch checkes whether nr is 0 after the above calculation, if 0
+is the result then set 1 to variable 'n'. Then at least bch_mca_scan()
+will try to shrink a single B+tree node.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/btree.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 48e33ee0d876..3df5fa4a501c 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -754,6 +754,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
+ * IO can always make forward progress:
+ */
+ nr /= c->btree_pages;
++ if (nr == 0)
++ nr = 1;
+ nr = min_t(unsigned long, nr, mca_can_free(c));
+
+ i = 0;
+--
+2.16.4
+
diff --git a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch
index 81646a5..b978abf 100644
--- a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch
+++ b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch
@@ -38,6 +38,7 @@ bch_prio_write() is failing to allocate buckets.
BugLink: https://bugs.launchpad.net/bugs/1784665
BugLink: https://bugs.launchpad.net/bugs/1796292
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
+Signed-off-by: Coly Li <colyli@suse.de>
---
drivers/md/bcache/alloc.c | 5 ++++-
drivers/md/bcache/bcache.h | 2 +-
diff --git a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch b/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch
deleted file mode 100644
index 7ccd838..0000000
--- a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-From 779bada095ec02a9bd400bc0a46039c4ead6c00d Mon Sep 17 00:00:00 2001
-From: Coly Li <colyli@suse.de>
-Date: Tue, 2 Jul 2019 22:30:29 +0800
-Subject: [PATCH] bcache: only set b->accessed = 1 for dirty btree node cache
-
----
- drivers/md/bcache/btree.c | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
-index ba434d9ac720..1497f1114b10 100644
---- a/drivers/md/bcache/btree.c
-+++ b/drivers/md/bcache/btree.c
-@@ -1058,7 +1058,10 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
- BUG_ON(!b->written);
-
- b->parent = parent;
-- b->accessed = 1;
-+
-+ /* make clean btree node more easier to be reclaim */
-+ if (!write)
-+ b->accessed = 1;
-
- for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
- prefetch(b->keys.set[i].tree);
---
-2.16.4
-