for-next: patches for 5.5

author: Coly Li <colyli@suse.de> 2019-11-13 12:01:56 +0800
committer: Coly Li <colyli@suse.de> 2019-11-13 12:01:56 +0800
commit: 04b62bc7b6b66568c57a2e487f456b5fb3168540 (patch)
tree: fee7115594e5b162ede3de5938697211d91196ac
parent: c435d8a502aafa090103345ecc4a94228c4d1152 (diff)
download: bcache-patches-04b62bc7b6b66568c57a2e487f456b5fb3168540.tar.gz
14 files changed, 721 insertions, 92 deletions
diff --git a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch b/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch
deleted file mode 100644
index edd8fb6..0000000
--- a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch
+++ /dev/null
@@ -1,29 +0,0 @@
-From: Shile Zhang <shile.zhang@linux.alibaba.com>
-Date: Thu, 15 Aug 2019 00:51:51 +0800
-Subject: [PATCH] bcache: add cond_resched() in __bch_cache_cmp()
-
-Read /sys/fs/bcache/<uuid>/cacheN/priority_stats can take very long
-time with huge cache after long run.
-
-Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
-Tested-by: Heitor Alves de Siqueira <halves@canonical.com>
-Signed-off-by: Coly Li <colyli@suse.de>
----
- drivers/md/bcache/sysfs.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
-index 9f0826712845..6b29e34acf7a 100644
---- a/drivers/md/bcache/sysfs.c
-+++ b/drivers/md/bcache/sysfs.c
-@@ -960,6 +960,7 @@ KTYPE(bch_cache_set_internal);
- 
- static int __bch_cache_cmp(const void *l, const void *r)
- {
-+	cond_resched();
- 	return *((uint16_t *)r) - *((uint16_t *)l);
- }
- 
--- 
-2.16.4
-
diff --git a/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch b/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch
new file mode 100644
index 0000000..259d0df
--- /dev/null
+++ b/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch
@@ -0,0 +1,81 @@
+From 77cedea4557df9dca93d6bfb33854a688e5cbce6 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 25 Sep 2019 22:16:33 +0800
+Subject: [PATCH 01/10] bcache: fix fifo index swapping condition in
+ journal_pin_cmp()
+
+Fifo structure journal.pin is implemented by a cycle buffer, if the back
+index reaches highest location of the cycle buffer, it will be swapped
+to 0. Once the swapping happens, it means a smaller fifo index might be
+associated to a newer journal entry. So the btree node with oldest
+journal entry won't be selected in bch_btree_leaf_dirty() to reference
+the dirty B+tree leaf node. As a result, the bcache journal may fail to
+protect the oldest unflushed dirty B+tree leaf node on power failure,
+and this B+tree leaf node may be inconsistent after reboot from the
+power failure.
+
+This patch fixes the fifo index comparing logic in journal_pin_cmp(),
+to avoid potential corrupted B+tree leaf node when the back index of
+journal pin is swapped.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/btree.c   | 26 ++++++++++++++++++++++++++
+ drivers/md/bcache/journal.h |  4 ----
+ 2 files changed, 26 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index ba434d9ac720..00523cd1db80 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -528,6 +528,32 @@ static void btree_node_write_work(struct work_struct *w)
+ 	mutex_unlock(&b->write_lock);
+ }
+ 
++/* return true if journal pin 'l' is newer than 'r' */
++static bool journal_pin_cmp(struct cache_set *c,
++			    atomic_t *l,
++			    atomic_t *r)
++{
++	int l_idx, r_idx, f_idx, b_idx;
++	bool ret = false;
++
++	l_idx = fifo_idx(&(c)->journal.pin, (l));
++	r_idx = fifo_idx(&(c)->journal.pin, (r));
++	f_idx = (c)->journal.pin.front;
++	b_idx = (c)->journal.pin.back;
++
++	if (l_idx > r_idx)
++		ret = true;
++	/* in case fifo back pointer is swapped */
++	if (b_idx < f_idx) {
++		if (l_idx <= b_idx && r_idx >= f_idx)
++			ret = true;
++		else if (l_idx >= f_idx && r_idx <= b_idx)
++			ret = false;
++	}
++
++	return ret;
++}
++
+ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
+ {
+ 	struct bset *i = btree_bset_last(b);
+diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
+index f2ea34d5f431..06b3eaab7d16 100644
+--- a/drivers/md/bcache/journal.h
++++ b/drivers/md/bcache/journal.h
+@@ -157,10 +157,6 @@ struct journal_device {
+ };
+ 
+ #define BTREE_FLUSH_NR	8
+-
+-#define journal_pin_cmp(c, l, r)				\
+-	(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
+-
+ #define JOURNAL_PIN	20000
+ 
+ #define journal_full(j)						\
+-- 
+2.16.4
+
diff --git a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch b/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch
deleted file mode 100644
index 44096e4..0000000
--- a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From 3c3c34a87be58548a302573dbe32b518f047db09 Mon Sep 17 00:00:00 2001
-From: Kent Overstreet <kent.overstreet@gmail.com>
-Date: Mon, 10 Jun 2019 15:14:20 -0400
-Subject: [PATCH] closures: fix a race on wakeup from closure_sync
-
-Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
-Acked-by: Coly Li <colyli@suse.de>
----
- drivers/md/bcache/closure.c | 10 ++++++++--
- 1 file changed, 8 insertions(+), 2 deletions(-)
-
-diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
-index 73f5319295bc..c12cd809ab19 100644
---- a/drivers/md/bcache/closure.c
-+++ b/drivers/md/bcache/closure.c
-@@ -105,8 +105,14 @@ struct closure_syncer {
- 
- static void closure_sync_fn(struct closure *cl)
- {
--	cl->s->done = 1;
--	wake_up_process(cl->s->task);
-+	struct closure_syncer *s = cl->s;
-+	struct task_struct *p;
-+
-+	rcu_read_lock();
-+	p = READ_ONCE(s->task);
-+	s->done = 1;
-+	wake_up_process(p);
-+	rcu_read_unlock();
- }
- 
- void __sched __closure_sync(struct closure *cl)
--- 
-2.16.4
-
diff --git a/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch b/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch
new file mode 100644
index 0000000..ae29ce1
--- /dev/null
+++ b/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch
@@ -0,0 +1,92 @@
+From b2641da321dd27c968515988b97e1cc20be6c937 Mon Sep 17 00:00:00 2001
+From: Guoju Fang <fangguoju@gmail.com>
+Date: Tue, 20 Aug 2019 06:13:55 -0400
+Subject: [PATCH 02/10] bcache: fix a lost wake-up problem caused by
+ mca_cannibalize_lock
+
+This patch fixes a lost wake-up problem caused by the race between
+mca_cannibalize_lock and bch_cannibalize_unlock.
+
+Consider two processes, A and B. Process A is executing
+mca_cannibalize_lock, while process B takes c->btree_cache_alloc_lock
+and is executing bch_cannibalize_unlock. The problem happens after
+process A executes cmpxchg but before it executes prepare_to_wait. In
+this window process B executes wake_up, and only afterwards does process
+A execute prepare_to_wait and set the state to TASK_UNINTERRUPTIBLE.
+Then process A goes to sleep but no one will wake it up. This problem
+may cause the bcache device to hang.
+
+Signed-off-by: Guoju Fang <fangguoju@gmail.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bcache.h |  1 +
+ drivers/md/bcache/btree.c  | 12 ++++++++----
+ drivers/md/bcache/super.c  |  1 +
+ 3 files changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 013e35a9e317..3653faf3bf48 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -582,6 +582,7 @@ struct cache_set {
+ 	 */
+ 	wait_queue_head_t	btree_cache_wait;
+ 	struct task_struct	*btree_cache_alloc_lock;
++	spinlock_t		btree_cannibalize_lock;
+ 
+ 	/*
+ 	 * When we free a btree node, we increment the gen of the bucket the
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 00523cd1db80..39d7fc1ef1ee 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -910,15 +910,17 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k)
+ 
+ static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+ {
+-	struct task_struct *old;
+-
+-	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+-	if (old && old != current) {
++	spin_lock(&c->btree_cannibalize_lock);
++	if (likely(c->btree_cache_alloc_lock == NULL)) {
++		c->btree_cache_alloc_lock = current;
++	} else if (c->btree_cache_alloc_lock != current) {
+ 		if (op)
+ 			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+ 					TASK_UNINTERRUPTIBLE);
++		spin_unlock(&c->btree_cannibalize_lock);
+ 		return -EINTR;
+ 	}
++	spin_unlock(&c->btree_cannibalize_lock);
+ 
+ 	return 0;
+ }
+@@ -953,10 +955,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+  */
+ static void bch_cannibalize_unlock(struct cache_set *c)
+ {
++	spin_lock(&c->btree_cannibalize_lock);
+ 	if (c->btree_cache_alloc_lock == current) {
+ 		c->btree_cache_alloc_lock = NULL;
+ 		wake_up(&c->btree_cache_wait);
+ 	}
++	spin_unlock(&c->btree_cannibalize_lock);
+ }
+ 
+ static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 20ed838e9413..ebb854ed05a4 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1769,6 +1769,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+ 	sema_init(&c->sb_write_mutex, 1);
+ 	mutex_init(&c->bucket_lock);
+ 	init_waitqueue_head(&c->btree_cache_wait);
++	spin_lock_init(&c->btree_cannibalize_lock);
+ 	init_waitqueue_head(&c->bucket_wait);
+ 	init_waitqueue_head(&c->gc_wait);
+ 	sema_init(&c->uuid_write_mutex, 1);
+-- 
+2.16.4
+
diff --git a/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch b/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch
new file mode 100644
index 0000000..d4faa45
--- /dev/null
+++ b/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch
@@ -0,0 +1,94 @@
+From 995330a14b286eed7407f8840e916720c31e440f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 28 Sep 2019 14:21:23 +0800
+Subject: [PATCH 03/10] bcache: fix static checker warning in
+ bcache_device_free()
+
+Commit cafe56359144 ("bcache: A block layer cache") leads to the
+following static checker warning:
+
+    ./drivers/md/bcache/super.c:770 bcache_device_free()
+    warn: variable dereferenced before check 'd->disk' (see line 766)
+
+drivers/md/bcache/super.c
+   762  static void bcache_device_free(struct bcache_device *d)
+   763  {
+   764          lockdep_assert_held(&bch_register_lock);
+   765
+   766          pr_info("%s stopped", d->disk->disk_name);
+                                      ^^^^^^^^^
+Unchecked dereference.
+
+   767
+   768          if (d->c)
+   769                  bcache_device_detach(d);
+   770          if (d->disk && d->disk->flags & GENHD_FL_UP)
+                    ^^^^^^^
+Check too late.
+
+   771                  del_gendisk(d->disk);
+   772          if (d->disk && d->disk->queue)
+   773                  blk_cleanup_queue(d->disk->queue);
+   774          if (d->disk) {
+   775                  ida_simple_remove(&bcache_device_idx,
+   776                                    first_minor_to_idx(d->disk->first_minor));
+   777                  put_disk(d->disk);
+   778          }
+   779
+
+It is not 100% sure that the gendisk struct of bcache device will always
+be there, the warning makes sense when there is problem in block core.
+
+This patch tries to remove the static checking warning by checking
+d->disk to avoid NULL pointer dereferences.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index ebb854ed05a4..7beccede5360 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -761,20 +761,28 @@ static inline int idx_to_first_minor(int idx)
+ 
+ static void bcache_device_free(struct bcache_device *d)
+ {
++	struct gendisk *disk = d->disk;
++
+ 	lockdep_assert_held(&bch_register_lock);
+ 
+-	pr_info("%s stopped", d->disk->disk_name);
++	if (disk)
++		pr_info("%s stopped", disk->disk_name);
++	else
++		pr_err("bcache device (NULL gendisk) stopped");
+ 
+ 	if (d->c)
+ 		bcache_device_detach(d);
+-	if (d->disk && d->disk->flags & GENHD_FL_UP)
+-		del_gendisk(d->disk);
+-	if (d->disk && d->disk->queue)
+-		blk_cleanup_queue(d->disk->queue);
+-	if (d->disk) {
++
++	if (disk) {
++		if (disk->flags & GENHD_FL_UP)
++			del_gendisk(disk);
++
++		if (disk->queue)
++			blk_cleanup_queue(disk->queue);
++
+ 		ida_simple_remove(&bcache_device_idx,
+-				  first_minor_to_idx(d->disk->first_minor));
+-		put_disk(d->disk);
++				  first_minor_to_idx(disk->first_minor));
++		put_disk(disk);
+ 	}
+ 
+ 	bioset_exit(&d->bio_split);
+-- 
+2.16.4
+
diff --git a/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch b/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch
new file mode 100644
index 0000000..306712d
--- /dev/null
+++ b/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch
@@ -0,0 +1,39 @@
+From fbe2ab194731b8ef42b3cc94c649b8db8c8e7440 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 30 Sep 2019 15:30:44 +0800
+Subject: [PATCH 04/10] bcache: add more accurate error messages in
+ read_super()
+
+Previous code only returns "Not a bcache superblock" for both bcache
+super block offset and magic error. This patch adds more accurate error
+messages,
+- for super block unmatched offset:
+  "Not a bcache superblock (bad offset)"
+- for super block unmatched magic number:
+  "Not a bcache superblock (bad magic)"
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/super.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 7beccede5360..623fdaf10c4c 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
+ 	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
+ 		 sb->version, sb->flags, sb->seq, sb->keys);
+ 
+-	err = "Not a bcache superblock";
++	err = "Not a bcache superblock (bad offset)";
+ 	if (sb->offset != SB_SECTOR)
+ 		goto err;
+ 
++	err = "Not a bcache superblock (bad magic)";
+ 	if (memcmp(sb->magic, bcache_magic, 16))
+ 		goto err;
+ 
+-- 
+2.16.4
+
diff --git a/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch b/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch
new file mode 100644
index 0000000..016a694
--- /dev/null
+++ b/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch
@@ -0,0 +1,41 @@
+From 8dbad7b7f9bcbfd6ea04e640378b8f3f49f9f275 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 8 Oct 2019 21:37:01 +0800
+Subject: [PATCH 05/10] bcache: deleted code comments for dead code in
+ bch_data_insert_keys()
+
+In request.c:bch_data_insert_keys(), there is code comment for a piece
+of dead code. This patch deletes the dead code and its code comment
+since they are useless in practice.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/request.c | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index 41adcd1546f1..73478a91a342 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl)
+ 	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
+ 	int ret;
+ 
+-	/*
+-	 * If we're looping, might already be waiting on
+-	 * another journal write - can't wait on more than one journal write at
+-	 * a time
+-	 *
+-	 * XXX: this looks wrong
+-	 */
+-#if 0
+-	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
+-		closure_sync(&s->cl);
+-#endif
+-
+ 	if (!op->replace)
+ 		journal_ref = bch_journal(op->c, &op->insert_keys,
+ 					  op->flush_journal ? cl : NULL);
+-- 
+2.16.4
+
diff --git a/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch b/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch
new file mode 100644
index 0000000..97e1c24
--- /dev/null
+++ b/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch
@@ -0,0 +1,38 @@
+From 7e659fd9ed2facc6d6782d2741c3f04293f9bf7a Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Thu, 10 Oct 2019 14:19:08 +0800
+Subject: [PATCH 06/10] bcache: add code comment bch_keylist_pop() and
+ bch_keylist_pop_front()
+
+This patch adds simple code comments for bch_keylist_pop() and
+bch_keylist_pop_front() in bset.c, to make the code easier to
+understand.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bset.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
+index 08768796b543..f37a429f093d 100644
+--- a/drivers/md/bcache/bset.c
++++ b/drivers/md/bcache/bset.c
+@@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s)
+ 	return 0;
+ }
+ 
++/* Pop the top key of keylist by pointing l->top to its previous key */
+ struct bkey *bch_keylist_pop(struct keylist *l)
+ {
+ 	struct bkey *k = l->keys;
+@@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
+ 	return l->top = k;
+ }
+ 
++/* Pop the bottom key of keylist and update l->top_p */
+ void bch_keylist_pop_front(struct keylist *l)
+ {
+ 	l->top_p -= bkey_u64s(l->keys);
+-- 
+2.16.4
+
diff --git a/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch b/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch
new file mode 100644
index 0000000..2e7fccc
--- /dev/null
+++ b/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch
@@ -0,0 +1,150 @@
+From 6932d005723a7a9d2b268c6c9065b2f6d47850cf Mon Sep 17 00:00:00 2001
+From: Andrea Righi <andrea.righi@canonical.com>
+Date: Wed, 7 Aug 2019 12:38:06 +0200
+Subject: [PATCH 07/10] bcache: fix deadlock in bcache_allocator
+
+bcache_allocator can call the following:
+
+ bch_allocator_thread()
+  -> bch_prio_write()
+     -> bch_bucket_alloc()
+        -> wait on &ca->set->bucket_wait
+
+But the wake up event on bucket_wait is supposed to come from
+bch_allocator_thread() itself => deadlock:
+
+[ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds.
+[ 1158.495929]       Not tainted 5.3.0-050300rc3-generic #201908042232
+[ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[ 1158.504413] bcache_allocato D    0 15861      2 0x80004000
+[ 1158.504419] Call Trace:
+[ 1158.504429]  __schedule+0x2a8/0x670
+[ 1158.504432]  schedule+0x2d/0x90
+[ 1158.504448]  bch_bucket_alloc+0xe5/0x370 [bcache]
+[ 1158.504453]  ? wait_woken+0x80/0x80
+[ 1158.504466]  bch_prio_write+0x1dc/0x390 [bcache]
+[ 1158.504476]  bch_allocator_thread+0x233/0x490 [bcache]
+[ 1158.504491]  kthread+0x121/0x140
+[ 1158.504503]  ? invalidate_buckets+0x890/0x890 [bcache]
+[ 1158.504506]  ? kthread_park+0xb0/0xb0
+[ 1158.504510]  ret_from_fork+0x35/0x40
+
+Fix by making the call to bch_prio_write() non-blocking, so that
+bch_allocator_thread() never waits on itself.
+
+Moreover, make sure to wake up the garbage collector thread when
+bch_prio_write() is failing to allocate buckets.
+
+BugLink: https://bugs.launchpad.net/bugs/1784665
+BugLink: https://bugs.launchpad.net/bugs/1796292
+Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/alloc.c  |  5 ++++-
+ drivers/md/bcache/bcache.h |  2 +-
+ drivers/md/bcache/super.c  | 27 +++++++++++++++++++++------
+ 3 files changed, 26 insertions(+), 8 deletions(-)
+
+diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
+index 6f776823b9ba..a1df0d95151c 100644
+--- a/drivers/md/bcache/alloc.c
++++ b/drivers/md/bcache/alloc.c
+@@ -377,7 +377,10 @@ static int bch_allocator_thread(void *arg)
+ 			if (!fifo_full(&ca->free_inc))
+ 				goto retry_invalidate;
+ 
+-			bch_prio_write(ca);
++			if (bch_prio_write(ca, false) < 0) {
++				ca->invalidate_needs_gc = 1;
++				wake_up_gc(ca->set);
++			}
+ 		}
+ 	}
+ out:
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 3653faf3bf48..50241e045c70 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -978,7 +978,7 @@ bool bch_cached_dev_error(struct cached_dev *dc);
+ __printf(2, 3)
+ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);
+ 
+-void bch_prio_write(struct cache *ca);
++int bch_prio_write(struct cache *ca, bool wait);
+ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
+ 
+ extern struct workqueue_struct *bcache_wq;
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 623fdaf10c4c..d1352fcc6ff2 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -530,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
+ 	closure_sync(cl);
+ }
+ 
+-void bch_prio_write(struct cache *ca)
++int bch_prio_write(struct cache *ca, bool wait)
+ {
+ 	int i;
+ 	struct bucket *b;
+ 	struct closure cl;
+ 
++	pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
++		 fifo_used(&ca->free[RESERVE_PRIO]),
++		 fifo_used(&ca->free[RESERVE_NONE]),
++		 fifo_used(&ca->free_inc));
++
++	/*
++	 * Pre-check if there are enough free buckets. In the non-blocking
++	 * scenario it's better to fail early rather than starting to allocate
++	 * buckets and do a cleanup later in case of failure.
++	 */
++	if (!wait) {
++		size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
++			       fifo_used(&ca->free[RESERVE_NONE]);
++		if (prio_buckets(ca) > avail)
++			return -ENOMEM;
++	}
++
+ 	closure_init_stack(&cl);
+ 
+ 	lockdep_assert_held(&ca->set->bucket_lock);
+@@ -545,9 +562,6 @@ void bch_prio_write(struct cache *ca)
+ 	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
+ 			&ca->meta_sectors_written);
+ 
+-	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+-	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+-
+ 	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+ 		long bucket;
+ 		struct prio_set *p = ca->disk_buckets;
+@@ -565,7 +579,7 @@ void bch_prio_write(struct cache *ca)
+ 		p->magic	= pset_magic(&ca->sb);
+ 		p->csum		= bch_crc64(&p->magic, bucket_bytes(ca) - 8);
+ 
+-		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
++		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
+ 		BUG_ON(bucket == -1);
+ 
+ 		mutex_unlock(&ca->set->bucket_lock);
+@@ -594,6 +608,7 @@ void bch_prio_write(struct cache *ca)
+ 
+ 		ca->prio_last_buckets[i] = ca->prio_buckets[i];
+ 	}
++	return 0;
+ }
+ 
+ static void prio_read(struct cache *ca, uint64_t bucket)
+@@ -1964,7 +1979,7 @@ static int run_cache_set(struct cache_set *c)
+ 
+ 		mutex_lock(&c->bucket_lock);
+ 		for_each_cache(ca, c, i)
+-			bch_prio_write(ca);
++			bch_prio_write(ca, true);
+ 		mutex_unlock(&c->bucket_lock);
+ 
+ 		err = "cannot allocate new UUID bucket";
+-- 
+2.16.4
+
diff --git a/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch b/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch
new file mode 100644
index 0000000..0296a83
--- /dev/null
+++ b/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch
@@ -0,0 +1,34 @@
+From 239fc6405b9264ac61ded073174cee1cbff16a9f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 12 Nov 2019 17:03:18 +0800
+Subject: [PATCH 08/10] bcache: add code comments in bch_btree_leaf_dirty()
+
+This patch adds code comments in bch_btree_leaf_dirty() to explain
+why w->journal should always reference the oldest journal pin of
+all the writing bkeys in the btree node, to make the bcache journal
+code easier to understand.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/btree.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 39d7fc1ef1ee..48e33ee0d876 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -569,6 +569,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
+ 
+ 	set_btree_node_dirty(b);
+ 
++	/*
++	 * w->journal is always the oldest journal pin of all bkeys
++	 * in the leaf node, to make sure the oldest jset seq won't
++	 * be increased before this btree node is flushed.
++	 */
+ 	if (journal_ref) {
+ 		if (w->journal &&
+ 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
+-- 
+2.16.4
+
diff --git a/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch b/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch
new file mode 100644
index 0000000..c8b3194
--- /dev/null
+++ b/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch
@@ -0,0 +1,105 @@
+From 1a03ba923ac3dcf83518ab5f876df2ebf6e0e8a7 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 12 Nov 2019 18:24:36 +0800
+Subject: [PATCH 09/10] bcache: add idle_max_writeback_rate sysfs interface
+
+For writeback mode, if there is no regular I/O request for a while,
+the writeback rate will be set to the maximum value (1TB/s for now).
+This is good for most storage workloads, but there are still
+people who don't want the maximum writeback rate in I/O idle time.
+
+This patch adds a sysfs interface file idle_max_writeback_rate to
+permit people to disable maximum writeback rate. Then the minimum
+writeback rate can be advised by writeback_rate_minimum in the
+bcache device's sysfs interface.
+
+Reported-by: Christian Balzer <chibi@gol.com>
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/bcache.h    | 1 +
+ drivers/md/bcache/super.c     | 1 +
+ drivers/md/bcache/sysfs.c     | 7 +++++++
+ drivers/md/bcache/writeback.c | 4 ++++
+ 4 files changed, 13 insertions(+)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 50241e045c70..9198c1b480d9 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -724,6 +724,7 @@ struct cache_set {
+ 	unsigned int		gc_always_rewrite:1;
+ 	unsigned int		shrinker_disabled:1;
+ 	unsigned int		copy_gc_enabled:1;
++	unsigned int		idle_max_writeback_rate_enabled:1;
+ 
+ #define BUCKET_HASH_BITS	12
+ 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index d1352fcc6ff2..77e9869345e7 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1834,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+ 	c->congested_read_threshold_us	= 2000;
+ 	c->congested_write_threshold_us	= 20000;
+ 	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
++	c->idle_max_writeback_rate_enabled = 1;
+ 	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
+ 
+ 	return c;
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index 627dcea0f5b6..733e2ddf3c78 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks);
+ rw_attribute(cache_replacement_policy);
+ rw_attribute(btree_shrinker_disabled);
+ rw_attribute(copy_gc_enabled);
++rw_attribute(idle_max_writeback_rate);
+ rw_attribute(gc_after_writeback);
+ rw_attribute(size);
+ 
+@@ -747,6 +748,8 @@ SHOW(__bch_cache_set)
+ 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
+ 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
+ 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
++	sysfs_printf(idle_max_writeback_rate,	"%i",
++		     c->idle_max_writeback_rate_enabled);
+ 	sysfs_printf(gc_after_writeback,	"%i", c->gc_after_writeback);
+ 	sysfs_printf(io_disable,		"%i",
+ 		     test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+@@ -864,6 +867,9 @@ STORE(__bch_cache_set)
+ 	sysfs_strtoul_bool(gc_always_rewrite,	c->gc_always_rewrite);
+ 	sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled);
+ 	sysfs_strtoul_bool(copy_gc_enabled,	c->copy_gc_enabled);
++	sysfs_strtoul_bool(idle_max_writeback_rate,
++			   c->idle_max_writeback_rate_enabled);
++
+ 	/*
+ 	 * write gc_after_writeback here may overwrite an already set
+ 	 * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
+@@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
+ 	&sysfs_gc_always_rewrite,
+ 	&sysfs_btree_shrinker_disabled,
+ 	&sysfs_copy_gc_enabled,
++	&sysfs_idle_max_writeback_rate,
+ 	&sysfs_gc_after_writeback,
+ 	&sysfs_io_disable,
+ 	&sysfs_cutoff_writeback,
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index d60268fe49e1..4a40f9eadeaf 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc)
+ static bool set_at_max_writeback_rate(struct cache_set *c,
+ 				       struct cached_dev *dc)
+ {
+	/* Don't set max writeback rate if it is disabled */
++	if (!c->idle_max_writeback_rate_enabled)
++		return false;
++
+ 	/* Don't set max writeback rate if gc is running */
+ 	if (!c->gc_mark_valid)
+ 		return false;
+-- 
+2.16.4
+
diff --git a/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch b/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch
new file mode 100644
index 0000000..bd2e023
--- /dev/null
+++ b/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch
@@ -0,0 +1,46 @@
+From 8cfdd5280f61070b29f0b4b711bf090397893dd0 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Tue, 12 Nov 2019 23:41:03 +0800
+Subject: [PATCH 10/10] bcache: at least try to shrink 1 node in bch_mca_scan()
+
+In bch_mca_scan(), the number of shrinking btree node is calculated
+by code like this,
+	unsigned long nr = sc->nr_to_scan;
+
+        nr /= c->btree_pages;
+        nr = min_t(unsigned long, nr, mca_can_free(c));
+Variable sc->nr_to_scan is the number of objects (here, the number of
+bcache B+tree nodes) to shrink, and pointer variable sc is passed from
+the memory management code as a parameter of the callback.
+
+If sc->nr_to_scan is smaller than c->btree_pages, after the above
+calculation, variable 'nr' will be 0 and nothing will be shrunk. It is
+frequently observed that sc->nr_to_scan is set to only 1 or 2, which
+makes nr zero. Then bch_mca_scan() will do nothing more than acquiring
+and releasing mutex c->bucket_lock.
+
+This patch checks whether nr is 0 after the above calculation; if so,
+variable 'nr' is set to 1. Then bch_mca_scan() will at least try to
+shrink a single B+tree node.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+---
+ drivers/md/bcache/btree.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index 48e33ee0d876..3df5fa4a501c 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -754,6 +754,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
+ 	 * IO can always make forward progress:
+ 	 */
+ 	nr /= c->btree_pages;
++	if (nr == 0)
++		nr = 1;
+ 	nr = min_t(unsigned long, nr, mca_can_free(c));
+ 
+ 	i = 0;
+-- 
+2.16.4
+
diff --git a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch
index 81646a5..b978abf 100644
--- a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch
+++ b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch
@@ -38,6 +38,7 @@ bch_prio_write() is failing to allocate buckets.
 BugLink: https://bugs.launchpad.net/bugs/1784665
 BugLink: https://bugs.launchpad.net/bugs/1796292
 Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
+Signed-off-by: Coly Li <colyli@suse.de>
 ---
  drivers/md/bcache/alloc.c  |  5 ++++-
  drivers/md/bcache/bcache.h |  2 +-
diff --git a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch b/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch
deleted file mode 100644
index 7ccd838..0000000
--- a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-From 779bada095ec02a9bd400bc0a46039c4ead6c00d Mon Sep 17 00:00:00 2001
-From: Coly Li <colyli@suse.de>
-Date: Tue, 2 Jul 2019 22:30:29 +0800
-Subject: [PATCH] bcache: only set b->accessed = 1 for dirty btree node cache
-
----
- drivers/md/bcache/btree.c | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
-index ba434d9ac720..1497f1114b10 100644
---- a/drivers/md/bcache/btree.c
-+++ b/drivers/md/bcache/btree.c
-@@ -1058,7 +1058,10 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
- 	BUG_ON(!b->written);
- 
- 	b->parent = parent;
--	b->accessed = 1;
-+
-+	/* make clean btree node more easier to be reclaim */
-+	if (!write)
-+		b->accessed = 1;
- 
- 	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
- 		prefetch(b->keys.set[i].tree);
--- 
-2.16.4
-
author	Coly Li <colyli@suse.de>	2019-11-13 12:01:56 +0800
committer	Coly Li <colyli@suse.de>	2019-11-13 12:01:56 +0800
commit	04b62bc7b6b66568c57a2e487f456b5fb3168540 (patch)
tree	fee7115594e5b162ede3de5938697211d91196ac
parent	c435d8a502aafa090103345ecc4a94228c4d1152 (diff)
download	bcache-patches-04b62bc7b6b66568c57a2e487f456b5fb3168540.tar.gz