diff options
author | Coly Li <colyli@suse.de> | 2019-11-13 12:01:56 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2019-11-13 12:01:56 +0800 |
commit | 04b62bc7b6b66568c57a2e487f456b5fb3168540 (patch) | |
tree | fee7115594e5b162ede3de5938697211d91196ac | |
parent | c435d8a502aafa090103345ecc4a94228c4d1152 (diff) | |
download | bcache-patches-04b62bc7b6b66568c57a2e487f456b5fb3168540.tar.gz |
for-next: patches for 5.5
14 files changed, 721 insertions, 92 deletions
diff --git a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch b/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch deleted file mode 100644 index edd8fb6..0000000 --- a/for-next/0001-bcache-add-cond_resched-in-__bch_cache_cmp.patch +++ /dev/null @@ -1,29 +0,0 @@ -From: Shile Zhang <shile.zhang@linux.alibaba.com> -Date: Thu, 15 Aug 2019 00:51:51 +0800 -Subject: [PATCH] bcache: add cond_resched() in __bch_cache_cmp() - -Read /sys/fs/bcache/<uuid>/cacheN/priority_stats can take very long -time with huge cache after long run. - -Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com> -Tested-by: Heitor Alves de Siqueira <halves@canonical.com> -Signed-off-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/sysfs.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c -index 9f0826712845..6b29e34acf7a 100644 ---- a/drivers/md/bcache/sysfs.c -+++ b/drivers/md/bcache/sysfs.c -@@ -960,6 +960,7 @@ KTYPE(bch_cache_set_internal); - - static int __bch_cache_cmp(const void *l, const void *r) - { -+ cond_resched(); - return *((uint16_t *)r) - *((uint16_t *)l); - } - --- -2.16.4 - diff --git a/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch b/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch new file mode 100644 index 0000000..259d0df --- /dev/null +++ b/for-next/0001-bcache-fix-fifo-index-swapping-condition-in-journal_.patch @@ -0,0 +1,81 @@ +From 77cedea4557df9dca93d6bfb33854a688e5cbce6 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 25 Sep 2019 22:16:33 +0800 +Subject: [PATCH 01/10] bcache: fix fifo index swapping condition in + journal_pin_cmp() + +Fifo structure journal.pin is implemented by a cycle buffer, if the back +index reaches highest location of the cycle buffer, it will be swapped +to 0. Once the swapping happens, it means a smaller fifo index might be +associated to a newer journal entry. 
So the btree node with oldest +journal entry won't be selected in bch_btree_leaf_dirty() to reference +the dirty B+tree leaf node. This problem may cause bcache journal won't +protect unflushed oldest B+tree dirty leaf node in power failure, and +this B+tree leaf node is possible to be inconsistent after reboot from +power failure. + +This patch fixes the fifo index comparing logic in journal_pin_cmp(), +to avoid potential corrupted B+tree leaf node when the back index of +journal pin is swapped. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/btree.c | 26 ++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 4 ---- + 2 files changed, 26 insertions(+), 4 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ba434d9ac720..00523cd1db80 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -528,6 +528,32 @@ static void btree_node_write_work(struct work_struct *w) + mutex_unlock(&b->write_lock); + } + ++/* return true if journal pin 'l' is newer than 'r' */ ++static bool journal_pin_cmp(struct cache_set *c, ++ atomic_t *l, ++ atomic_t *r) ++{ ++ int l_idx, r_idx, f_idx, b_idx; ++ bool ret = false; ++ ++ l_idx = fifo_idx(&(c)->journal.pin, (l)); ++ r_idx = fifo_idx(&(c)->journal.pin, (r)); ++ f_idx = (c)->journal.pin.front; ++ b_idx = (c)->journal.pin.back; ++ ++ if (l_idx > r_idx) ++ ret = true; ++ /* in case fifo back pointer is swapped */ ++ if (b_idx < f_idx) { ++ if (l_idx <= b_idx && r_idx >= f_idx) ++ ret = true; ++ else if (l_idx >= f_idx && r_idx <= b_idx) ++ ret = false; ++ } ++ ++ return ret; ++} ++ + static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) + { + struct bset *i = btree_bset_last(b); +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index f2ea34d5f431..06b3eaab7d16 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -157,10 +157,6 @@ struct journal_device { + }; + + #define BTREE_FLUSH_NR 8 +- 
+-#define journal_pin_cmp(c, l, r) \ +- (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) +- + #define JOURNAL_PIN 20000 + + #define journal_full(j) \ +-- +2.16.4 + diff --git a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch b/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch deleted file mode 100644 index 44096e4..0000000 --- a/for-next/0001-closures-fix-a-race-on-wakeup-from-closure_sync.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 3c3c34a87be58548a302573dbe32b518f047db09 Mon Sep 17 00:00:00 2001 -From: Kent Overstreet <kent.overstreet@gmail.com> -Date: Mon, 10 Jun 2019 15:14:20 -0400 -Subject: [PATCH] closures: fix a race on wakeup from closure_sync - -Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> -Acked-by: Coly Li <colyli@suse.de> ---- - drivers/md/bcache/closure.c | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c -index 73f5319295bc..c12cd809ab19 100644 ---- a/drivers/md/bcache/closure.c -+++ b/drivers/md/bcache/closure.c -@@ -105,8 +105,14 @@ struct closure_syncer { - - static void closure_sync_fn(struct closure *cl) - { -- cl->s->done = 1; -- wake_up_process(cl->s->task); -+ struct closure_syncer *s = cl->s; -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = READ_ONCE(s->task); -+ s->done = 1; -+ wake_up_process(p); -+ rcu_read_unlock(); - } - - void __sched __closure_sync(struct closure *cl) --- -2.16.4 - diff --git a/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch b/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch new file mode 100644 index 0000000..ae29ce1 --- /dev/null +++ b/for-next/0002-bcache-fix-a-lost-wake-up-problem-caused-by-mca_cann.patch @@ -0,0 +1,92 @@ +From b2641da321dd27c968515988b97e1cc20be6c937 Mon Sep 17 00:00:00 2001 +From: Guoju Fang <fangguoju@gmail.com> +Date: Tue, 20 Aug 2019 06:13:55 -0400 +Subject: [PATCH 02/10] bcache: 
fix a lost wake-up problem caused by + mca_cannibalize_lock + +This patch fixes a lost wake-up problem caused by the race between +mca_cannibalize_lock and bch_cannibalize_unlock. + +Consider two processes, A and B. Process A is executing +mca_cannibalize_lock, while process B takes c->btree_cache_alloc_lock +and is executing bch_cannibalize_unlock. The problem happens that after +process A executes cmpxchg and will execute prepare_to_wait. In this +timeslice process B executes wake_up, but after that process A executes +prepare_to_wait and sets the state to TASK_UNINTERRUPTIBLE. Then process A +goes to sleep but no one will wake it up. This problem may cause bcache +device to hang. + +Signed-off-by: Guoju Fang <fangguoju@gmail.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/btree.c | 12 ++++++++---- + drivers/md/bcache/super.c | 1 + + 3 files changed, 10 insertions(+), 4 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 013e35a9e317..3653faf3bf48 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -582,6 +582,7 @@ struct cache_set { + */ + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; ++ spinlock_t btree_cannibalize_lock; + + /* + * When we free a btree node, we increment the gen of the bucket the +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 00523cd1db80..39d7fc1ef1ee 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -910,15 +910,17 @@ static struct btree *mca_find(struct cache_set *c, struct bkey *k) + + static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) + { +- struct task_struct *old; +- +- old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); +- if (old && old != current) { ++ spin_lock(&c->btree_cannibalize_lock); ++ if (likely(c->btree_cache_alloc_lock == NULL)) { ++ c->btree_cache_alloc_lock = current; ++ } else if 
(c->btree_cache_alloc_lock != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); ++ spin_unlock(&c->btree_cannibalize_lock); + return -EINTR; + } ++ spin_unlock(&c->btree_cannibalize_lock); + + return 0; + } +@@ -953,10 +955,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + */ + static void bch_cannibalize_unlock(struct cache_set *c) + { ++ spin_lock(&c->btree_cannibalize_lock); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); + } ++ spin_unlock(&c->btree_cannibalize_lock); + } + + static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 20ed838e9413..ebb854ed05a4 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1769,6 +1769,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + sema_init(&c->sb_write_mutex, 1); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->btree_cache_wait); ++ spin_lock_init(&c->btree_cannibalize_lock); + init_waitqueue_head(&c->bucket_wait); + init_waitqueue_head(&c->gc_wait); + sema_init(&c->uuid_write_mutex, 1); +-- +2.16.4 + diff --git a/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch b/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch new file mode 100644 index 0000000..d4faa45 --- /dev/null +++ b/for-next/0003-bcache-fix-static-checker-warning-in-bcache_device_f.patch @@ -0,0 +1,94 @@ +From 995330a14b286eed7407f8840e916720c31e440f Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 28 Sep 2019 14:21:23 +0800 +Subject: [PATCH 03/10] bcache: fix static checker warning in + bcache_device_free() + +Commit cafe56359144 ("bcache: A block layer cache") leads to the +following static checker warning: + + ./drivers/md/bcache/super.c:770 bcache_device_free() + warn: variable dereferenced before check 
'd->disk' (see line 766) + +drivers/md/bcache/super.c + 762 static void bcache_device_free(struct bcache_device *d) + 763 { + 764 lockdep_assert_held(&bch_register_lock); + 765 + 766 pr_info("%s stopped", d->disk->disk_name); + ^^^^^^^^^ +Unchecked dereference. + + 767 + 768 if (d->c) + 769 bcache_device_detach(d); + 770 if (d->disk && d->disk->flags & GENHD_FL_UP) + ^^^^^^^ +Check too late. + + 771 del_gendisk(d->disk); + 772 if (d->disk && d->disk->queue) + 773 blk_cleanup_queue(d->disk->queue); + 774 if (d->disk) { + 775 ida_simple_remove(&bcache_device_idx, + 776 first_minor_to_idx(d->disk->first_minor)); + 777 put_disk(d->disk); + 778 } + 779 + +It is not 100% sure that the gendisk struct of bcache device will always +be there, the warning makes sense when there is problem in block core. + +This patch tries to remove the static checking warning by checking +d->disk to avoid NULL pointer dereferences. + +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index ebb854ed05a4..7beccede5360 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -761,20 +761,28 @@ static inline int idx_to_first_minor(int idx) + + static void bcache_device_free(struct bcache_device *d) + { ++ struct gendisk *disk = d->disk; ++ + lockdep_assert_held(&bch_register_lock); + +- pr_info("%s stopped", d->disk->disk_name); ++ if (disk) ++ pr_info("%s stopped", disk->disk_name); ++ else ++ pr_err("bcache device (NULL gendisk) stopped"); + + if (d->c) + bcache_device_detach(d); +- if (d->disk && d->disk->flags & GENHD_FL_UP) +- del_gendisk(d->disk); +- if (d->disk && d->disk->queue) +- blk_cleanup_queue(d->disk->queue); +- if (d->disk) { ++ ++ if (disk) { ++ if (disk->flags & GENHD_FL_UP) ++ del_gendisk(disk); ++ ++ if (disk->queue) ++ 
blk_cleanup_queue(disk->queue); ++ + ida_simple_remove(&bcache_device_idx, +- first_minor_to_idx(d->disk->first_minor)); +- put_disk(d->disk); ++ first_minor_to_idx(disk->first_minor)); ++ put_disk(disk); + } + + bioset_exit(&d->bio_split); +-- +2.16.4 + diff --git a/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch b/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch new file mode 100644 index 0000000..306712d --- /dev/null +++ b/for-next/0004-bcache-add-more-accurate-error-messages-in-read_supe.patch @@ -0,0 +1,39 @@ +From fbe2ab194731b8ef42b3cc94c649b8db8c8e7440 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 30 Sep 2019 15:30:44 +0800 +Subject: [PATCH 04/10] bcache: add more accurate error messages in + read_super() + +Previous code only returns "Not a bcache superblock" for both bcache +super block offset and magic error. This patch adds more accurate error +messages, +- for super block unmatched offset: + "Not a bcache superblock (bad offset)" +- for super block unmatched magic number: + "Not a bcache superblock (bad magic)" + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/super.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 7beccede5360..623fdaf10c4c 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + sb->version, sb->flags, sb->seq, sb->keys); + +- err = "Not a bcache superblock"; ++ err = "Not a bcache superblock (bad offset)"; + if (sb->offset != SB_SECTOR) + goto err; + ++ err = "Not a bcache superblock (bad magic)"; + if (memcmp(sb->magic, bcache_magic, 16)) + goto err; + +-- +2.16.4 + diff --git a/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch 
b/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch new file mode 100644 index 0000000..016a694 --- /dev/null +++ b/for-next/0005-bcache-deleted-code-comments-for-dead-code-in-bch_da.patch @@ -0,0 +1,41 @@ +From 8dbad7b7f9bcbfd6ea04e640378b8f3f49f9f275 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 8 Oct 2019 21:37:01 +0800 +Subject: [PATCH 05/10] bcache: deleted code comments for dead code in + bch_data_insert_keys() + +In request.c:bch_data_insert_keys(), there is code comment for a piece +of dead code. This patch deletes the dead code and its code comment +since they are useless in practice. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/request.c | 12 ------------ + 1 file changed, 12 deletions(-) + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 41adcd1546f1..73478a91a342 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl) + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; + int ret; + +- /* +- * If we're looping, might already be waiting on +- * another journal write - can't wait on more than one journal write at +- * a time +- * +- * XXX: this looks wrong +- */ +-#if 0 +- while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) +- closure_sync(&s->cl); +-#endif +- + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? 
cl : NULL); +-- +2.16.4 + diff --git a/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch b/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch new file mode 100644 index 0000000..97e1c24 --- /dev/null +++ b/for-next/0006-bcache-add-code-comment-bch_keylist_pop-and-bch_keyl.patch @@ -0,0 +1,38 @@ +From 7e659fd9ed2facc6d6782d2741c3f04293f9bf7a Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Thu, 10 Oct 2019 14:19:08 +0800 +Subject: [PATCH 06/10] bcache: add code comment bch_keylist_pop() and + bch_keylist_pop_front() + +This patch adds simple code comments for bch_keylist_pop() and +bch_keylist_pop_front() in bset.c, to make the code easier to +understand. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bset.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c +index 08768796b543..f37a429f093d 100644 +--- a/drivers/md/bcache/bset.c ++++ b/drivers/md/bcache/bset.c +@@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) + return 0; + } + ++/* Pop the top key of keylist by pointing l->top to its previous key */ + struct bkey *bch_keylist_pop(struct keylist *l) + { + struct bkey *k = l->keys; +@@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) + return l->top = k; + } + ++/* Pop the bottom key of keylist and update l->top_p */ + void bch_keylist_pop_front(struct keylist *l) + { + l->top_p -= bkey_u64s(l->keys); +-- +2.16.4 + diff --git a/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch b/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch new file mode 100644 index 0000000..2e7fccc --- /dev/null +++ b/for-next/0007-bcache-fix-deadlock-in-bcache_allocator.patch @@ -0,0 +1,150 @@ +From 6932d005723a7a9d2b268c6c9065b2f6d47850cf Mon Sep 17 00:00:00 2001 +From: Andrea Righi <andrea.righi@canonical.com> +Date: Wed, 7 Aug 2019 12:38:06 +0200 +Subject: [PATCH 07/10] 
bcache: fix deadlock in bcache_allocator + +bcache_allocator can call the following: + + bch_allocator_thread() + -> bch_prio_write() + -> bch_bucket_alloc() + -> wait on &ca->set->bucket_wait + +But the wake up event on bucket_wait is supposed to come from +bch_allocator_thread() itself => deadlock: + +[ 1158.490744] INFO: task bcache_allocato:15861 blocked for more than 10 seconds. +[ 1158.495929] Not tainted 5.3.0-050300rc3-generic #201908042232 +[ 1158.500653] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[ 1158.504413] bcache_allocato D 0 15861 2 0x80004000 +[ 1158.504419] Call Trace: +[ 1158.504429] __schedule+0x2a8/0x670 +[ 1158.504432] schedule+0x2d/0x90 +[ 1158.504448] bch_bucket_alloc+0xe5/0x370 [bcache] +[ 1158.504453] ? wait_woken+0x80/0x80 +[ 1158.504466] bch_prio_write+0x1dc/0x390 [bcache] +[ 1158.504476] bch_allocator_thread+0x233/0x490 [bcache] +[ 1158.504491] kthread+0x121/0x140 +[ 1158.504503] ? invalidate_buckets+0x890/0x890 [bcache] +[ 1158.504506] ? kthread_park+0xb0/0xb0 +[ 1158.504510] ret_from_fork+0x35/0x40 + +Fix by making the call to bch_prio_write() non-blocking, so that +bch_allocator_thread() never waits on itself. + +Moreover, make sure to wake up the garbage collector thread when +bch_prio_write() is failing to allocate buckets. 
+ +BugLink: https://bugs.launchpad.net/bugs/1784665 +BugLink: https://bugs.launchpad.net/bugs/1796292 +Signed-off-by: Andrea Righi <andrea.righi@canonical.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/alloc.c | 5 ++++- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 27 +++++++++++++++++++++------ + 3 files changed, 26 insertions(+), 8 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index 6f776823b9ba..a1df0d95151c 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -377,7 +377,10 @@ static int bch_allocator_thread(void *arg) + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + +- bch_prio_write(ca); ++ if (bch_prio_write(ca, false) < 0) { ++ ca->invalidate_needs_gc = 1; ++ wake_up_gc(ca->set); ++ } + } + } + out: +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 3653faf3bf48..50241e045c70 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -978,7 +978,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); + +-void bch_prio_write(struct cache *ca); ++int bch_prio_write(struct cache *ca, bool wait); + void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); + + extern struct workqueue_struct *bcache_wq; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 623fdaf10c4c..d1352fcc6ff2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -530,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, + closure_sync(cl); + } + +-void bch_prio_write(struct cache *ca) ++int bch_prio_write(struct cache *ca, bool wait) + { + int i; + struct bucket *b; + struct closure cl; + ++ pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", ++ fifo_used(&ca->free[RESERVE_PRIO]), ++ fifo_used(&ca->free[RESERVE_NONE]), ++ fifo_used(&ca->free_inc)); ++ ++ /* ++ 
* Pre-check if there are enough free buckets. In the non-blocking ++ * scenario it's better to fail early rather than starting to allocate ++ * buckets and do a cleanup later in case of failure. ++ */ ++ if (!wait) { ++ size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + ++ fifo_used(&ca->free[RESERVE_NONE]); ++ if (prio_buckets(ca) > avail) ++ return -ENOMEM; ++ } ++ + closure_init_stack(&cl); + + lockdep_assert_held(&ca->set->bucket_lock); +@@ -545,9 +562,6 @@ void bch_prio_write(struct cache *ca) + atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), + &ca->meta_sectors_written); + +- //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), +- // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); +- + for (i = prio_buckets(ca) - 1; i >= 0; --i) { + long bucket; + struct prio_set *p = ca->disk_buckets; +@@ -565,7 +579,7 @@ void bch_prio_write(struct cache *ca) + p->magic = pset_magic(&ca->sb); + p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + +- bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); ++ bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); + BUG_ON(bucket == -1); + + mutex_unlock(&ca->set->bucket_lock); +@@ -594,6 +608,7 @@ void bch_prio_write(struct cache *ca) + + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } ++ return 0; + } + + static void prio_read(struct cache *ca, uint64_t bucket) +@@ -1964,7 +1979,7 @@ static int run_cache_set(struct cache_set *c) + + mutex_lock(&c->bucket_lock); + for_each_cache(ca, c, i) +- bch_prio_write(ca); ++ bch_prio_write(ca, true); + mutex_unlock(&c->bucket_lock); + + err = "cannot allocate new UUID bucket"; +-- +2.16.4 + diff --git a/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch b/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch new file mode 100644 index 0000000..0296a83 --- /dev/null +++ b/for-next/0008-bcache-add-code-comments-in-bch_btree_leaf_dirty.patch @@ -0,0 +1,34 @@ +From 239fc6405b9264ac61ded073174cee1cbff16a9f Mon Sep 17 00:00:00 2001 
+From: Coly Li <colyli@suse.de> +Date: Tue, 12 Nov 2019 17:03:18 +0800 +Subject: [PATCH 08/10] bcache: add code comments in bch_btree_leaf_dirty() + +This patch adds code comments in bch_btree_leaf_dirty() to explain +why w->journal should always reference the eldest journal pin of +all the writing bkeys in the btree node. This makes the bcache journal +code easier to understand. + +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/btree.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 39d7fc1ef1ee..48e33ee0d876 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -569,6 +569,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) + + set_btree_node_dirty(b); + ++ /* ++ * w->journal is always the oldest journal pin of all bkeys ++ * in the leaf node, to make sure the oldest jset seq won't ++ * be increased before this btree node is flushed. ++ */ + if (journal_ref) { + if (w->journal && + journal_pin_cmp(b->c, w->journal, journal_ref)) { +-- +2.16.4 + diff --git a/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch b/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch new file mode 100644 index 0000000..c8b3194 --- /dev/null +++ b/for-next/0009-bcache-add-idle_max_writeback_rate-sysfs-interface.patch @@ -0,0 +1,105 @@ +From 1a03ba923ac3dcf83518ab5f876df2ebf6e0e8a7 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 12 Nov 2019 18:24:36 +0800 +Subject: [PATCH 09/10] bcache: add idle_max_writeback_rate sysfs interface + +For writeback mode, if there is no regular I/O request for a while, +the writeback rate will be set to the maximum value (1TB/s for now). +This is good for most of the storage workload, but there are still +people who don't want the maximum writeback rate in I/O idle time. 
+ +This patch adds a sysfs interface file idle_max_writeback_rate to +permit people to disable maximum writeback rate. Then the minimum +writeback rate can be advised by writeback_rate_minimum in the +bcache device's sysfs interface. + +Reported-by: Christian Balzer <chibi@gol.com> +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/super.c | 1 + + drivers/md/bcache/sysfs.c | 7 +++++++ + drivers/md/bcache/writeback.c | 4 ++++ + 4 files changed, 13 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 50241e045c70..9198c1b480d9 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -724,6 +724,7 @@ struct cache_set { + unsigned int gc_always_rewrite:1; + unsigned int shrinker_disabled:1; + unsigned int copy_gc_enabled:1; ++ unsigned int idle_max_writeback_rate_enabled:1; + + #define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index d1352fcc6ff2..77e9869345e7 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1834,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; ++ c->idle_max_writeback_rate_enabled = 1; + WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + return c; +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 627dcea0f5b6..733e2ddf3c78 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks); + rw_attribute(cache_replacement_policy); + rw_attribute(btree_shrinker_disabled); + rw_attribute(copy_gc_enabled); ++rw_attribute(idle_max_writeback_rate); + rw_attribute(gc_after_writeback); + rw_attribute(size); + +@@ -747,6 +748,8 @@ SHOW(__bch_cache_set) + 
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ sysfs_printf(idle_max_writeback_rate, "%i", ++ c->idle_max_writeback_rate_enabled); + sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); + sysfs_printf(io_disable, "%i", + test_bit(CACHE_SET_IO_DISABLE, &c->flags)); +@@ -864,6 +867,9 @@ STORE(__bch_cache_set) + sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); ++ sysfs_strtoul_bool(idle_max_writeback_rate, ++ c->idle_max_writeback_rate_enabled); ++ + /* + * write gc_after_writeback here may overwrite an already set + * BCH_DO_AUTO_GC, it doesn't matter because this flag will be +@@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, ++ &sysfs_idle_max_writeback_rate, + &sysfs_gc_after_writeback, + &sysfs_io_disable, + &sysfs_cutoff_writeback, +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index d60268fe49e1..4a40f9eadeaf 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) + static bool set_at_max_writeback_rate(struct cache_set *c, + struct cached_dev *dc) + { ++ /* Don't set max writeback rate if it is disabled */ ++ if (!c->idle_max_writeback_rate_enabled) ++ return false; ++ + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; +-- +2.16.4 + diff --git a/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch b/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch new file mode 100644 index 0000000..bd2e023 --- /dev/null +++ 
b/for-next/0010-bcache-at-least-try-to-shrink-1-node-in-bch_mca_scan.patch @@ -0,0 +1,46 @@ +From 8cfdd5280f61070b29f0b4b711bf090397893dd0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 12 Nov 2019 23:41:03 +0800 +Subject: [PATCH 10/10] bcache: at least try to shrink 1 node in bch_mca_scan() + +In bch_mca_scan(), the number of shrinking btree node is calculated +by code like this, + unsigned long nr = sc->nr_to_scan; + + nr /= c->btree_pages; + nr = min_t(unsigned long, nr, mca_can_free(c)); +variable sc->nr_to_scan is number of objects (here is bcache B+tree +nodes' number) to shrink, and pointer variable sc is sent from memory +management code as parameter of a callback. + +If sc->nr_to_scan is smaller than c->btree_pages, after the above +calculation, variable 'nr' will be 0 and nothing will be shrunk. It is +frequently observed that only 1 or 2 is set to sc->nr_to_scan and make +nr to be zero. Then bch_mca_scan() will do nothing more than acquiring +and releasing mutex c->bucket_lock. + +This patch checks whether nr is 0 after the above calculation, if 0 +is the result then set 1 to variable 'nr'. Then at least bch_mca_scan() +will try to shrink a single B+tree node. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/btree.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 48e33ee0d876..3df5fa4a501c 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -754,6 +754,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, + * IO can always make forward progress: + */ + nr /= c->btree_pages; ++ if (nr == 0) ++ nr = 1; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; +-- +2.16.4 + diff --git a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch index 81646a5..b978abf 100644 --- a/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch +++ b/for-test/0001-bcache-fix-deadlock-in-bcache_allocator.patch @@ -38,6 +38,7 @@ bch_prio_write() is failing to allocate buckets. BugLink: https://bugs.launchpad.net/bugs/1784665 BugLink: https://bugs.launchpad.net/bugs/1796292 Signed-off-by: Andrea Righi <andrea.righi@canonical.com> +Signed-off-by: Coly Li <colyli@suse.de> --- drivers/md/bcache/alloc.c | 5 ++++- drivers/md/bcache/bcache.h | 2 +- diff --git a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch b/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch deleted file mode 100644 index 7ccd838..0000000 --- a/for-test/0001-bcache-only-set-b-accessed-1-for-dirty-btree-node-ca.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 779bada095ec02a9bd400bc0a46039c4ead6c00d Mon Sep 17 00:00:00 2001 -From: Coly Li <colyli@suse.de> -Date: Tue, 2 Jul 2019 22:30:29 +0800 -Subject: [PATCH] bcache: only set b->accessed = 1 for dirty btree node cache - ---- - drivers/md/bcache/btree.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c -index ba434d9ac720..1497f1114b10 100644 ---- a/drivers/md/bcache/btree.c -+++ b/drivers/md/bcache/btree.c -@@ 
-1058,7 +1058,10 @@ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, - BUG_ON(!b->written); - - b->parent = parent; -- b->accessed = 1; -+ -+ /* make clean btree node more easier to be reclaim */ -+ if (!write) -+ b->accessed = 1; - - for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { - prefetch(b->keys.set[i].tree); --- -2.16.4 - |