aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorColy Li <colyli@suse.de>2018-02-05 23:15:25 +0800
committerColy Li <colyli@suse.de>2018-02-05 23:15:25 +0800
commitdcc1da5ab0d2ea18f2f5e5dd769047fcc9c0bb43 (patch)
treebc72e87b45912fac9995a624c0f5f12e7715fd6c
parent2bbb32018a3cbf7780cf14ba16b9935837b2ce5f (diff)
downloadbcache-patches-dcc1da5ab0d2ea18f2f5e5dd769047fcc9c0bb43.tar.gz
for-next: update v5 patch set
-rw-r--r--for-next/v4/v4-0000-cover-letter.patch (renamed from for-next/v4-0000-cover-letter.patch)0
-rw-r--r--for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch (renamed from for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch)0
-rw-r--r--for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch (renamed from for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch)0
-rw-r--r--for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch (renamed from for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch)0
-rw-r--r--for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch (renamed from for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch)0
-rw-r--r--for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch (renamed from for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch)0
-rw-r--r--for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch (renamed from for-next/v4-0006-bcache-set-error_limit-correctly.patch)0
-rw-r--r--for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch (renamed from for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch)0
-rw-r--r--for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch (renamed from for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch)0
-rw-r--r--for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch (renamed from for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch)0
-rw-r--r--for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch (renamed from for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch)0
-rw-r--r--for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch (renamed from for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch)0
-rw-r--r--for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch (renamed from for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch)0
-rw-r--r--for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch (renamed from for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch)0
-rw-r--r--for-next/v5-0000-cover-letter.patch92
-rw-r--r--for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch79
-rw-r--r--for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch178
-rw-r--r--for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch130
-rw-r--r--for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch268
-rw-r--r--for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch491
-rw-r--r--for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch67
-rw-r--r--for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch119
-rw-r--r--for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch255
-rw-r--r--for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch237
-rw-r--r--for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch152
-rw-r--r--for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch251
26 files changed, 2319 insertions, 0 deletions
diff --git a/for-next/v4-0000-cover-letter.patch b/for-next/v4/v4-0000-cover-letter.patch
index 0327afe..0327afe 100644
--- a/for-next/v4-0000-cover-letter.patch
+++ b/for-next/v4/v4-0000-cover-letter.patch
diff --git a/for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch
index 51edd0b..51edd0b 100644
--- a/for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch
+++ b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch
diff --git a/for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch
index 113dd97..113dd97 100644
--- a/for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch
+++ b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch
diff --git a/for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch
index f85123b..f85123b 100644
--- a/for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch
+++ b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch
diff --git a/for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch
index 349a3d1..349a3d1 100644
--- a/for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch
+++ b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch
diff --git a/for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch
index 2e6ce9b..2e6ce9b 100644
--- a/for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch
+++ b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch
diff --git a/for-next/v4-0006-bcache-set-error_limit-correctly.patch b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch
index 927468d..927468d 100644
--- a/for-next/v4-0006-bcache-set-error_limit-correctly.patch
+++ b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch
diff --git a/for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch
index 849d522..849d522 100644
--- a/for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch
+++ b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch
diff --git a/for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch
index eab5e76..eab5e76 100644
--- a/for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch
+++ b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch
diff --git a/for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch
index 048a30a..048a30a 100644
--- a/for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch
+++ b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch
diff --git a/for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch
index 80f6dc8..80f6dc8 100644
--- a/for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch
+++ b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch
diff --git a/for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch
index 6b4ae2a..6b4ae2a 100644
--- a/for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch
+++ b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch
diff --git a/for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch
index e73bf4f..e73bf4f 100644
--- a/for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch
+++ b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch
diff --git a/for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch
index d9edf10..d9edf10 100644
--- a/for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch
+++ b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch
diff --git a/for-next/v5-0000-cover-letter.patch b/for-next/v5-0000-cover-letter.patch
new file mode 100644
index 0000000..de57fc5
--- /dev/null
+++ b/for-next/v5-0000-cover-letter.patch
@@ -0,0 +1,92 @@
+From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 5 Feb 2018 18:26:45 +0800
+Subject: [PATCH v5 00/11] bcache: device failure handling improvement
+
+Hi maintainers and folks,
+
+This patch set tries to improve bcache device failure handling, includes
+cache device and backing device failures.
+
+The basic idea to handle failed cache device is,
+- Unregister cache set
+- Detach all backing devices which are attached to this cache set
+- Stop all the detached bcache devices (configurable)
+- Stop all flash only volume on the cache set
+The above process is named 'cache set retire' by me. The result of cache
+set retire is, cache set and bcache devices are all removed, following
+I/O requests will get failed immediately to notify upper layer or user
+space code that the cache device is failed or disconnected.
+- Stop all the detached bcache devices (configurable)
+- Stop all flash only volume on the cache set
+The above process is named 'cache set retire' by me. The result of cache
+set retire is, cache set and bcache devices are all removed
+(configurable), following I/O requests will get failed immediately to
+notify upper layer or user space code that the cache device is failed or
+disconnected.
+
+There are 2 patches from the v4 patch set merged into bcache-for-next, they
+are not in v5 patch set any more.
+
+In v5 patch set add a new patch "bcache: add stop_when_cache_set_failed
+option to backing device", which provides "auto"/"always" options to
+configure whether or not to stop bcache device for a broken cache set.
+
+Most of the patches are reviewed by Hannes Reinecke and Junhui Tang. There
+are still several patches that need to be reviewed,
+- [PATCH v5 03/11] bcache: quit dc->writeback_thread when
+ BCACHE_DEV_DETACHING is set
+- [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to
+ backing device
+
+Any comment, question and review are warmly welcome. Thanks in advance.
+
+Changelog:
+v5: add [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to
+ backing device.
+ fix issues from v4 patch set.
+ improve kernel message format, remove redundant prefix string.
+v4: add per-cached_dev option stop_attached_devs_on_fail to avoid stopping
+ attached bcache device from a retiring cache set.
+v3: fix detach issue found in v2 patch set.
+v2: fixes all problems found in v1 review.
+ add patches to handle backing device failure.
+ add one more patch to set writeback_rate_update_seconds range.
+ include a patch from Junhui Tang.
+v1: the initial version, only handles cache device failure.
+
+Coly Li
+
+
+Coly Li (10):
+ bcache: set writeback_rate_update_seconds in range [1, 60] seconds
+ bcache: fix cached_dev->count usage for bch_cache_set_error()
+ bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set
+ bcache: stop dc->writeback_rate_update properly
+ bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags
+ bcache: stop all attached bcache devices for a retired cache set
+ bcache: add backing_request_endio() for bi_end_io of attached backing
+ device I/O
+ bcache: add io_disable to struct cached_dev
+ bcache: stop bcache device when backing device is offline
+ bcache: add stop_when_cache_set_failed option to backing device
+
+Tang Junhui (1):
+ bcache: fix inaccurate io state for detached bcache devices
+
+ drivers/md/bcache/alloc.c | 3 +-
+ drivers/md/bcache/bcache.h | 44 ++++++++-
+ drivers/md/bcache/btree.c | 10 +-
+ drivers/md/bcache/io.c | 16 +++-
+ drivers/md/bcache/journal.c | 4 +-
+ drivers/md/bcache/request.c | 185 +++++++++++++++++++++++++++++++------
+ drivers/md/bcache/super.c | 206 ++++++++++++++++++++++++++++++++++++++----
+ drivers/md/bcache/sysfs.c | 59 +++++++++++-
+ drivers/md/bcache/util.h | 6 --
+ drivers/md/bcache/writeback.c | 94 ++++++++++++++++---
+ drivers/md/bcache/writeback.h | 5 +-
+ 11 files changed, 551 insertions(+), 81 deletions(-)
+
+--
+2.16.1
+
diff --git a/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch
new file mode 100644
index 0000000..d218724
--- /dev/null
+++ b/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch
@@ -0,0 +1,79 @@
+From 31fa907c6f962fc229c4364ac08b239f7bf5384d Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 13 Jan 2018 15:11:03 +0800
+Subject: [PATCH v5 01/11] bcache: set writeback_rate_update_seconds in range
+ [1, 60] seconds
+
+dc->writeback_rate_update_seconds can be set via sysfs and its value can
+be set to [1, ULONG_MAX]. It does not make sense to set such a large
+value, 60 seconds is long enough value considering the default 5 seconds
+works well for long time.
+
+Because dc->writeback_rate_update is a special delayed work, it re-arms
+itself inside the delayed work routine update_writeback_rate(). When
+stopping it by cancel_delayed_work_sync(), there should be a timeout to
+wait and make sure the re-armed delayed work is stopped too. A small max
+value of dc->writeback_rate_update_seconds is also helpful to decide a
+reasonable small timeout.
+
+This patch limits sysfs interface to set dc->writeback_rate_update_seconds
+in range of [1, 60] seconds, and replaces the hand-coded number by macros.
+
+Changelog:
+v2: fix a rebase typo in v4, which is pointed out by Michael Lyle.
+v1: initial version.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Michael Lyle <mlyle@lyle.org>
+---
+ drivers/md/bcache/sysfs.c | 4 +++-
+ drivers/md/bcache/writeback.c | 2 +-
+ drivers/md/bcache/writeback.h | 3 +++
+ 3 files changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index c524305cc9a7..4a6a697e1680 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -218,7 +218,9 @@ STORE(__cached_dev)
+ sysfs_strtoul_clamp(writeback_rate,
+ dc->writeback_rate.rate, 1, INT_MAX);
+
+- d_strtoul_nonzero(writeback_rate_update_seconds);
++ sysfs_strtoul_clamp(writeback_rate_update_seconds,
++ dc->writeback_rate_update_seconds,
++ 1, WRITEBACK_RATE_UPDATE_SECS_MAX);
+ d_strtoul(writeback_rate_i_term_inverse);
+ d_strtoul_nonzero(writeback_rate_p_term_inverse);
+
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index 58218f7e77c3..f1d2fc15abcc 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -655,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
+ dc->writeback_rate.rate = 1024;
+ dc->writeback_rate_minimum = 8;
+
+- dc->writeback_rate_update_seconds = 5;
++ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
+ dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_i_term_inverse = 10000;
+
+diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
+index 66f1c527fa24..587b25599856 100644
+--- a/drivers/md/bcache/writeback.h
++++ b/drivers/md/bcache/writeback.h
+@@ -8,6 +8,9 @@
+ #define MAX_WRITEBACKS_IN_PASS 5
+ #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
+
++#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
++#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
++
+ /*
+ * 14 (16384ths) is chosen here as something that each backing device
+ * should be a reasonable fraction of the share, and not to blow up
+--
+2.16.1
+
diff --git a/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch
new file mode 100644
index 0000000..1c09d03
--- /dev/null
+++ b/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch
@@ -0,0 +1,178 @@
+From c88508fc06f1d3d2241104d5e3b7b5e0045d24c0 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 8 Jan 2018 23:05:58 +0800
+Subject: [PATCH v5 02/11] bcache: fix cached_dev->count usage for
+ bch_cache_set_error()
+
+When bcache metadata I/O fails, bcache will call bch_cache_set_error()
+to retire the whole cache set. The expected behavior to retire a cache
+set is to unregister the cache set, and unregister all backing device
+attached to this cache set, then remove sysfs entries of the cache set
+and all attached backing devices, finally release memory of structs
+cache_set, cache, cached_dev and bcache_device.
+
+In my testing when journal I/O failure triggered by disconnected cache
+device, sometimes the cache set cannot be retired, and its sysfs
+entry /sys/fs/bcache/<uuid> still exits and the backing device also
+references it. This is not expected behavior.
+
+When metadata I/O fails, the call sequence to retire whole cache set is,
+ bch_cache_set_error()
+ bch_cache_set_unregister()
+ bch_cache_set_stop()
+ __cache_set_unregister() <- called as callback by calling
+ closure_queue(&c->caching)
+ cache_set_flush() <- called as a callback when refcount
+ of cache_set->caching is 0
+ cache_set_free() <- called as a callback when refcount
+ of cache_set->cl is 0
+ bch_cache_set_release() <- called as a callback when refcount
+ of cache_set->kobj is 0
+
+I find if kernel thread bch_writeback_thread() quits while-loop when
+kthread_should_stop() is true and searched_full_index is false, closure
+callback cache_set_flush() set by continue_at() will never be called. The
+result is, bcache fails to retire whole cache set.
+
+cache_set_flush() will be called when refcount of closure c->caching is 0,
+and in function bcache_device_detach() refcount of closure c->caching is
+released to 0 by closure_put(). In metadata error code path, function
+bcache_device_detach() is called by cached_dev_detach_finish(). This is a
+callback routine being called when cached_dev->count is 0. This refcount
+is decreased by cached_dev_put().
+
+The above dependence indicates, cache_set_flush() will be called when
+refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0
+when refcount of cached_dev->count is 0.
+
+The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails
+and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount
+of cached_dev is not decreased properly.
+
+In bch_writeback_thread(), cached_dev_put() is called only when
+searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a
+there is no dirty data on cache. In most of run time it is correct, but
+when bch_writeback_thread() quits the while-loop while cache is still
+dirty, current code forget to call cached_dev_put() before this kernel
+thread exits. This is why sometimes cache_set_flush() is not executed and
+cache set fails to be retired.
+
+The reason to call cached_dev_put() in bch_writeback_rate() is, when the
+cache device changes from clean to dirty, cached_dev_get() is called, to
+make sure during writeback operations both backing and cache devices
+won't be released.
+
+Adding following code in bch_writeback_thread() does not work,
+ static int bch_writeback_thread(void *arg)
+ }
+
++ if (atomic_read(&dc->has_dirty))
++ cached_dev_put()
++
+ return 0;
+ }
+because writeback kernel thread can be woken up and started via sysfs entry:
+ echo 1 > /sys/block/bcache<N>/bcache/writeback_running
+It is difficult to check whether backing device is dirty without race and
+extra lock. So the above modification will introduce potential refcount
+underflow in some conditions.
+
+The correct fix is, to take cached dev refcount when creating the kernel
+thread, and put it before the kernel thread exits. Then bcache does not
+need to take a cached dev refcount when cache turns from clean to dirty,
+or to put a cached dev refcount when cache turns from dirty to clean. The
+writeback kernel thread is always safe to reference data structure from
+cache set, cache and cached device (because a refcount of cache device is
+taken for it already), and no matter the kernel thread is stopped by I/O
+errors or system reboot, cached_dev->count can always be used correctly.
+
+The patch is simple, but understanding how it works is quite complicated.
+
+Changelog:
+v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes.
+v1: initial version for review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+---
+ drivers/md/bcache/super.c | 1 -
+ drivers/md/bcache/writeback.c | 11 ++++++++---
+ drivers/md/bcache/writeback.h | 2 --
+ 3 files changed, 8 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index a2ad37a8afc0..7d96dc6860fa 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+ if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
+ bch_sectors_dirty_init(&dc->disk);
+ atomic_set(&dc->has_dirty, 1);
+- refcount_inc(&dc->count);
+ bch_writeback_queue(dc);
+ }
+
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index f1d2fc15abcc..b280c134dd4d 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg)
+
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+- return 0;
++ break;
+ }
+
+ schedule();
+@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg)
+ if (searched_full_index &&
+ RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+ atomic_set(&dc->has_dirty, 0);
+- cached_dev_put(dc);
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, NULL);
+ }
+@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg)
+ }
+ }
+
++ dc->writeback_thread = NULL;
++ cached_dev_put(dc);
++
+ return 0;
+ }
+
+@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
+ if (!dc->writeback_write_wq)
+ return -ENOMEM;
+
++ cached_dev_get(dc);
+ dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+ "bcache_writeback");
+- if (IS_ERR(dc->writeback_thread))
++ if (IS_ERR(dc->writeback_thread)) {
++ cached_dev_put(dc);
+ return PTR_ERR(dc->writeback_thread);
++ }
+
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
+index 587b25599856..0bba8f1c6cdf 100644
+--- a/drivers/md/bcache/writeback.h
++++ b/drivers/md/bcache/writeback.h
+@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc)
+ {
+ if (!atomic_read(&dc->has_dirty) &&
+ !atomic_xchg(&dc->has_dirty, 1)) {
+- refcount_inc(&dc->count);
+-
+ if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+ /* XXX: should do this synchronously */
+--
+2.16.1
+
diff --git a/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch
new file mode 100644
index 0000000..d9aa14d
--- /dev/null
+++ b/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch
@@ -0,0 +1,130 @@
+From ad362b22f19cca3073dfe9290529ea5ff63c8b4b Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sun, 14 Jan 2018 21:41:57 +0800
+Subject: [PATCH v5 03/11] bcache: quit dc->writeback_thread when
+ BCACHE_DEV_DETACHING is set
+
+In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()",
+cached_dev_get() is called when creating dc->writeback_thread, and
+cached_dev_put() is called when exiting dc->writeback_thread. This
+modification works well unless people detach the bcache device manually by
+ 'echo 1 > /sys/block/bcache<N>/bcache/detach'
+Because this sysfs interface only calls bch_cached_dev_detach() which wakes
+up dc->writeback_thread but does not stop it. The reason is, before patch
+"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside
+bch_writeback_thread(), if cache is not dirty after writeback,
+cached_dev_put() will be called here. And in cached_dev_make_request() when
+a new write request makes cache from clean to dirty, cached_dev_get() will
+be called there. Since we don't operate dc->count in these locations,
+refcount d->count cannot be dropped after cache becomes clean, and
+cached_dev_detach_finish() won't be called to detach bcache device.
+
+This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is
+set inside bch_writeback_thread(). If this bit is set and cache is clean
+(no existing writeback_keys), break the while-loop, call cached_dev_put()
+and quit the writeback thread.
+
+Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the
+writeback thread should continue to perform writeback, this is the original
+design of manually detach.
+
+It is safe to do the following check without locking, let me explain why,
++ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
++ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
+
+If the kernel thread does not sleep and continue to run due to conditions
+are not updated in time on the running CPU core, it just consumes more CPU
+cycles and has no hurt. This should-sleep-but-run is safe here. We just
+focus on the should-run-but-sleep condition, which means the writeback
+thread goes to sleep in mistake while it should continue to run.
+1, First of all, no matter the writeback thread is hung or not, kthread_stop() from
+ cached_dev_detach_finish() will wake up it and terminate by making
+ kthread_should_stop() return true. And in normal run time, bit on index
+ BCACHE_DEV_DETACHING is always cleared, the condition
+ !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)
+ is always true and can be ignored as constant value.
+2, If one of the following conditions is true, the writeback thread should
+ go to sleep,
+ "!atomic_read(&dc->has_dirty)" or "!dc->writeback_running)"
+ each of them independently controls the writeback thread should sleep or
+ not, let's analyse them one by one.
+2.1 condition "!atomic_read(&dc->has_dirty)"
+ If dc->has_dirty is set from 0 to 1 on another CPU core, bcache will
+ call bch_writeback_queue() immediately or call bch_writeback_add() which
+ indirectly calls bch_writeback_queue() too. In bch_writeback_queue(),
+ wake_up_process(dc->writeback_thread) is called. It sets writeback
+ thread's task state to TASK_RUNNING and following an implicit memory
+ barrier, then tries to wake up the writeback thread.
+ In writeback thread, its task state is set to TASK_INTERRUPTIBLE before
+ doing the condition check. If other CPU core sets the TASK_RUNNING state
+ after writeback thread setting TASK_INTERRUPTIBLE, the writeback thread
+ will be scheduled to run very soon because its state is not
+ TASK_INTERRUPTIBLE. If other CPU core sets the TASK_RUNNING state before
+ writeback thread setting TASK_INTERRUPTIBLE, the implicit memory barrier
+ of wake_up_process() will make sure modification of dc->has_dirty on
+ other CPU core is updated and observed on the CPU core of writeback
+ thread. Therefore the condition check will correctly be false, and
+ continue writeback code without sleeping.
+2.2 condition "!dc->writeback_running)"
+ dc->writeback_running can be changed via sysfs file, every time it is
+ modified, a following bch_writeback_queue() is always called. So the
+ change is always observed on the CPU core of writeback thread. If
+ dc->writeback_running is changed from 0 to 1 on other CPU core, this
+ condition check will observe the modification and allow writeback
+ thread to continue to run without sleeping.
+Now we can see, even without a locking protection, multiple conditions
+check is safe here, no deadlock or process hang up will happen.
+
+I compose a separate patch because that patch "bcache: fix cached_dev->count
+usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes
+Reinecke. Also this fix is not trivial and good for a separate patch.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Hannes Reinecke <hare@suse.com>
+Cc: Huijun Tang <tang.junhui@zte.com.cn>
+---
+ drivers/md/bcache/writeback.c | 20 +++++++++++++++++---
+ 1 file changed, 17 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index b280c134dd4d..4dbeaaa575bf 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg)
+ while (!kthread_should_stop()) {
+ down_write(&dc->writeback_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+- if (!atomic_read(&dc->has_dirty) ||
+- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
+- !dc->writeback_running)) {
++ /*
++ * If the bache device is detaching, skip here and continue
++ * to perform writeback. Otherwise, if no dirty data on cache,
++ * or there is dirty data on cache but writeback is disabled,
++ * the writeback thread should sleep here and wait for others
++ * to wake up it.
++ */
++ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
++ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
+ up_write(&dc->writeback_lock);
+
+ if (kthread_should_stop()) {
+@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg)
+ atomic_set(&dc->has_dirty, 0);
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, NULL);
++ /*
++ * If bcache device is detaching via sysfs interface,
++ * writeback thread should stop after there is no dirty
++ * data on cache. BCACHE_DEV_DETACHING flag is set in
++ * bch_cached_dev_detach().
++ */
++ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
++ break;
+ }
+
+ up_write(&dc->writeback_lock);
+--
+2.16.1
+
diff --git a/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch
new file mode 100644
index 0000000..83524b8
--- /dev/null
+++ b/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch
@@ -0,0 +1,268 @@
+From f4ce0fb52b341b89a5302daa4a0c2ce716281867 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 13 Jan 2018 15:48:39 +0800
+Subject: [PATCH v5 04/11] bcache: stop dc->writeback_rate_update properly
+
+struct delayed_work writeback_rate_update in struct cached_dev is a delayed
+worker to call function update_writeback_rate() in period (the interval is
+defined by dc->writeback_rate_update_seconds).
+
+When a metadata I/O error happens on cache device, bcache error handling
+routine bch_cache_set_error() will call bch_cache_set_unregister() to
+retire whole cache set. On the unregister code path, this delayed work is
+stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
+
+dc->writeback_rate_update is a special delayed work from others in bcache.
+In its routine update_writeback_rate(), this delayed work is re-armed
+itself. That means when cancel_delayed_work_sync() returns, this delayed
+work can still be executed after several seconds defined by
+dc->writeback_rate_update_seconds.
+
+The problem is, after cancel_delayed_work_sync() returns, the cache set
+unregister code path will continue and release memory of struct cache set.
+Then the delayed work is scheduled to run, __update_writeback_rate()
+will reference the already released cache_set memory, and trigger a NULL
+pointer dereference fault.
+
+This patch introduces two more bcache device flags,
+- BCACHE_DEV_WB_RUNNING
+ bit set: bcache device is in writeback mode and running, it is OK for
+ dc->writeback_rate_update to re-arm itself.
+ bit clear:bcache device is trying to stop dc->writeback_rate_update,
+ this delayed work should not re-arm itself and quit.
+- BCACHE_DEV_RATE_DW_RUNNING
+ bit set: routine update_writeback_rate() is executing.
+ bit clear: routine update_writeback_rate() quits.
+
+This patch also adds a function cancel_writeback_rate_update_dwork() to
+wait for dc->writeback_rate_update quits before cancel it by calling
+cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected
+quit dc->writeback_rate_update, after time_out seconds this function will
+give up and continue to call cancel_delayed_work_sync().
+
+And here I explain how this patch stops self re-armed delayed work properly
+with the above stuffs.
+
+update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
+and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
+cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
+
+Before calling cancel_delayed_work_sync() wait until flag
+BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
+cancel_delayed_work_sync(), dc->writeback_rate_update must be already re-
+armed, or quit by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
+delayed work routine update_writeback_rate() won't be executed after
+cancel_delayed_work_sync() returns.
+
+Inside update_writeback_rate() before calling schedule_delayed_work(), flag
+BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means
+someone is about to stop the delayed work. Because flag
+BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
+has to wait for this flag to be cleared, we don't need to worry about race
+condition here.
+
+If update_writeback_rate() is scheduled to run after checking
+BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
+in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
+moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
+previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
+and quit immediately.
+
+Because there are more dependences inside update_writeback_rate() to struct
+cache_set memory, dc->writeback_rate_update is not a simple self re-arm
+delayed work. After trying many different methods (e.g. hold dc->count, or
+use locks), this is the only way I can find which works to properly stop
+dc->writeback_rate_update delayed work.
+
+Changelog:
+v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
+ to bit index, for test_bit().
+v2: Try to fix the race issue which is pointed out by Junhui.
+v1: The initial version for review
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Hannes Reinecke <hare@suse.com>
+---
+ drivers/md/bcache/bcache.h | 9 +++++----
+ drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++----
+ drivers/md/bcache/sysfs.c | 3 ++-
+ drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++-
+ 4 files changed, 70 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index b8c2e1bef1f1..0380626bf525 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -258,10 +258,11 @@ struct bcache_device {
+ struct gendisk *disk;
+
+ unsigned long flags;
+-#define BCACHE_DEV_CLOSING 0
+-#define BCACHE_DEV_DETACHING 1
+-#define BCACHE_DEV_UNLINK_DONE 2
+-
++#define BCACHE_DEV_CLOSING 0
++#define BCACHE_DEV_DETACHING 1
++#define BCACHE_DEV_UNLINK_DONE 2
++#define BCACHE_DEV_WB_RUNNING 3
++#define BCACHE_DEV_RATE_DW_RUNNING 4
+ unsigned nr_stripes;
+ unsigned stripe_size;
+ atomic_t *stripe_sectors_dirty;
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 7d96dc6860fa..e15cacecf078 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc)
+ pr_debug("error creating sysfs link");
+ }
+
++/*
++ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
++ * work dc->writeback_rate_update is running. Wait until the routine
++ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
++ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
++ * seconds, give up waiting here and continue to cancel it too.
++ */
++static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
++{
++ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
++
++ do {
++ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
++ &dc->disk.flags))
++ break;
++ time_out--;
++ schedule_timeout_interruptible(1);
++ } while (time_out > 0);
++
++ if (time_out == 0)
++ pr_warn("give up waiting for dc->writeback_write_update"
++ " to quit");
++
++ cancel_delayed_work_sync(&dc->writeback_rate_update);
++}
++
+ static void cached_dev_detach_finish(struct work_struct *w)
+ {
+ struct cached_dev *dc = container_of(w, struct cached_dev, detach);
+@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w)
+
+ mutex_lock(&bch_register_lock);
+
+- cancel_delayed_work_sync(&dc->writeback_rate_update);
++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
++ cancel_writeback_rate_update_dwork(dc);
++
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
+ closure_get(&dc->disk.cl);
+
+ bch_writeback_queue(dc);
++
+ cached_dev_put(dc);
+ }
+
+@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl)
+ {
+ struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
+
+- cancel_delayed_work_sync(&dc->writeback_rate_update);
++ mutex_lock(&bch_register_lock);
++
++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
++ cancel_writeback_rate_update_dwork(dc);
++
+ if (!IS_ERR_OR_NULL(dc->writeback_thread))
+ kthread_stop(dc->writeback_thread);
+ if (dc->writeback_write_wq)
+ destroy_workqueue(dc->writeback_write_wq);
+
+- mutex_lock(&bch_register_lock);
+-
+ if (atomic_read(&dc->running))
+ bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
+ bcache_device_free(&dc->disk);
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index 4a6a697e1680..399e91cbf714 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -306,7 +306,8 @@ STORE(bch_cached_dev)
+ bch_writeback_queue(dc);
+
+ if (attr == &sysfs_writeback_percent)
+- schedule_delayed_work(&dc->writeback_rate_update,
++ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
++ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+
+ mutex_unlock(&bch_register_lock);
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index 4dbeaaa575bf..8f98ef1038d3 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work)
+ struct cached_dev,
+ writeback_rate_update);
+
++ /*
++ * should check BCACHE_DEV_RATE_DW_RUNNING before calling
++ * cancel_delayed_work_sync().
++ */
++ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
++ smp_mb();
++
++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) {
++ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
++ smp_mb();
++ return;
++ }
++
+ down_read(&dc->writeback_lock);
+
+ if (atomic_read(&dc->has_dirty) &&
+@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work)
+
+ up_read(&dc->writeback_lock);
+
+- schedule_delayed_work(&dc->writeback_rate_update,
++ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) {
++ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
++ }
++
++ /*
++ * should check BCACHE_DEV_RATE_DW_RUNNING before calling
++ * cancel_delayed_work_sync().
++ */
++ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
++ smp_mb();
+ }
+
+ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
+ dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_i_term_inverse = 10000;
+
++ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
+ INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+ }
+
+@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
+ return PTR_ERR(dc->writeback_thread);
+ }
+
++ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+
+--
+2.16.1
+
diff --git a/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch
new file mode 100644
index 0000000..57afe27
--- /dev/null
+++ b/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch
@@ -0,0 +1,491 @@
+From 805fefa99ced3d184d69c65449eb3e24104346b9 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sun, 14 Jan 2018 22:15:00 +0800
+Subject: [PATCH v5 05/11] bcache: add CACHE_SET_IO_DISABLE to struct cache_set
+ flags
+
+When too many I/Os failed on cache device, bch_cache_set_error() is called
+in the error handling code path to retire whole problematic cache set. If
+new I/O requests continue to come and take refcount dc->count, the cache
+set won't be retired immediately, this is a problem.
+
+Furthermore, there are several kernel threads and self-armed kernel workers
+which may still be running after bch_cache_set_error() is called. It needs to wait
+quite a while for them to stop, or they won't stop at all. They also
+prevent the cache set from being retired.
+
+The solution in this patch is, to add per cache set flag to disable I/O
+request on this cache and all attached backing devices. Then new coming I/O
+requests can be rejected in *_make_request() before taking refcount, kernel
+threads and self-armed kernel worker can stop very fast when flags bit
+CACHE_SET_IO_DISABLE is set.
+
+Because bcache also do internal I/Os for writeback, garbage collection,
+bucket allocation, journaling, this kind of I/O should be disabled after
+bch_cache_set_error() is called. So closure_bio_submit() is modified to
+check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set,
+closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and
+return, generic_make_request() won't be called.
+
+A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit
+from cache_set->flags, to disable or enable cache set I/O for debugging. It
+is helpful to trigger more corner case issues for failed cache device.
+
+Changelog
+v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index.
+ remove "bcache: " prefix when printing out kernel message.
+v2, more changes by previous review,
+- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui.
+- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this
+ is reported and inspired from origal patch of Pavel Vazharov.
+v1, initial version.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Pavel Vazharov <freakpv@gmail.com>
+---
+ drivers/md/bcache/alloc.c | 3 ++-
+ drivers/md/bcache/bcache.h | 18 ++++++++++++++++++
+ drivers/md/bcache/btree.c | 10 +++++++---
+ drivers/md/bcache/io.c | 2 +-
+ drivers/md/bcache/journal.c | 4 ++--
+ drivers/md/bcache/request.c | 26 +++++++++++++++++++-------
+ drivers/md/bcache/super.c | 6 +++++-
+ drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++
+ drivers/md/bcache/util.h | 6 ------
+ drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++--------
+ 10 files changed, 101 insertions(+), 29 deletions(-)
+
+diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
+index 458e1d38577d..004cc3cc6123 100644
+--- a/drivers/md/bcache/alloc.c
++++ b/drivers/md/bcache/alloc.c
+@@ -287,7 +287,8 @@ do { \
+ break; \
+ \
+ mutex_unlock(&(ca)->set->bucket_lock); \
+- if (kthread_should_stop()) { \
++ if (kthread_should_stop() || \
++ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \
+ set_current_state(TASK_RUNNING); \
+ return 0; \
+ } \
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 0380626bf525..7917b3820dd5 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -475,10 +475,15 @@ struct gc_stat {
+ *
+ * CACHE_SET_RUNNING means all cache devices have been registered and journal
+ * replay is complete.
++ *
++ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whole cache set, all
++ * external and internal I/O should be denied when this flag is set.
++ *
+ */
+ #define CACHE_SET_UNREGISTERING 0
+ #define CACHE_SET_STOPPING 1
+ #define CACHE_SET_RUNNING 2
++#define CACHE_SET_IO_DISABLE 3
+
+ struct cache_set {
+ struct closure cl;
+@@ -868,6 +873,19 @@ static inline void wake_up_allocators(struct cache_set *c)
+ wake_up_process(ca->alloc_thread);
+ }
+
++static inline void closure_bio_submit(struct cache_set *c,
++ struct bio *bio,
++ struct closure *cl)
++{
++ closure_get(cl);
++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) {
++ bio->bi_status = BLK_STS_IOERR;
++ bio_endio(bio);
++ return;
++ }
++ generic_make_request(bio);
++}
++
+ /* Forward declarations */
+
+ void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
+diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
+index fad9fe8817eb..8ca50f387a1d 100644
+--- a/drivers/md/bcache/btree.c
++++ b/drivers/md/bcache/btree.c
+@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c)
+
+ btree_gc_start(c);
+
++ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
+ do {
+ ret = btree_root(gc_root, c, &op, &writes, &stats);
+ closure_sync(&writes);
+@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c)
+
+ if (ret && ret != -EAGAIN)
+ pr_warn("gc failed!");
+- } while (ret);
++ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+ bch_btree_gc_finish(c);
+ wake_up_allocators(c);
+@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg)
+
+ while (1) {
+ wait_event_interruptible(c->gc_wait,
+- kthread_should_stop() || gc_should_run(c));
++ kthread_should_stop() ||
++ test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
++ gc_should_run(c));
+
+- if (kthread_should_stop())
++ if (kthread_should_stop() ||
++ test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+ break;
+
+ set_gc_sectors(c);
+diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
+index a783c5a41ff1..8013ecbcdbda 100644
+--- a/drivers/md/bcache/io.c
++++ b/drivers/md/bcache/io.c
+@@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
+ bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev);
+
+ b->submit_time_us = local_clock_us();
+- closure_bio_submit(bio, bio->bi_private);
++ closure_bio_submit(c, bio, bio->bi_private);
+ }
+
+ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
+diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
+index 1b736b860739..c94085f400a4 100644
+--- a/drivers/md/bcache/journal.c
++++ b/drivers/md/bcache/journal.c
+@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bch_bio_map(bio, data);
+
+- closure_bio_submit(bio, &cl);
++ closure_bio_submit(ca->set, bio, &cl);
+ closure_sync(&cl);
+
+ /* This function could be simpler now since we no longer write
+@@ -674,7 +674,7 @@ static void journal_write_unlocked(struct closure *cl)
+ spin_unlock(&c->journal.lock);
+
+ while ((bio = bio_list_pop(&list)))
+- closure_bio_submit(bio, cl);
++ closure_bio_submit(c, bio, cl);
+
+ continue_at(cl, journal_write_done, NULL);
+ }
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index 1a46b41dac70..02296bda6384 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl)
+
+ /* XXX: invalidate cache */
+
+- closure_bio_submit(bio, cl);
++ closure_bio_submit(s->iop.c, bio, cl);
+ }
+
+ continue_at(cl, cached_dev_cache_miss_done, NULL);
+@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
+ s->cache_miss = miss;
+ s->iop.bio = cache_bio;
+ bio_get(cache_bio);
+- closure_bio_submit(cache_bio, &s->cl);
++ closure_bio_submit(s->iop.c, cache_bio, &s->cl);
+
+ return ret;
+ out_put:
+@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
+ out_submit:
+ miss->bi_end_io = request_endio;
+ miss->bi_private = &s->cl;
+- closure_bio_submit(miss, &s->cl);
++ closure_bio_submit(s->iop.c, miss, &s->cl);
+ return ret;
+ }
+
+@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
+
+ if ((bio_op(bio) != REQ_OP_DISCARD) ||
+ blk_queue_discard(bdev_get_queue(dc->bdev)))
+- closure_bio_submit(bio, cl);
++ closure_bio_submit(s->iop.c, bio, cl);
+ } else if (s->iop.writeback) {
+ bch_writeback_add(dc);
+ s->iop.bio = bio;
+@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
+ flush->bi_private = cl;
+ flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+- closure_bio_submit(flush, cl);
++ closure_bio_submit(s->iop.c, flush, cl);
+ }
+ } else {
+ s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+
+- closure_bio_submit(bio, cl);
++ closure_bio_submit(s->iop.c, bio, cl);
+ }
+
+ closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl)
+ bch_journal_meta(s->iop.c, cl);
+
+ /* If it's a flush, we send the flush to the backing device too */
+- closure_bio_submit(bio, cl);
++ closure_bio_submit(s->iop.c, bio, cl);
+
+ continue_at(cl, cached_dev_bio_complete, NULL);
+ }
+@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ int rw = bio_data_dir(bio);
+
++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
++ bio->bi_status = BLK_STS_IOERR;
++ bio_endio(bio);
++ return BLK_QC_T_NONE;
++ }
++
+ atomic_set(&dc->backing_idle, 0);
+ generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+
+@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
+ struct bcache_device *d = bio->bi_disk->private_data;
+ int rw = bio_data_dir(bio);
+
++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
++ bio->bi_status = BLK_STS_IOERR;
++ bio_endio(bio);
++ return BLK_QC_T_NONE;
++ }
++
+ generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+
+ s = search_alloc(bio, d);
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index e15cacecf078..f8b0d1196c12 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
+ bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
+ bch_bio_map(bio, ca->disk_buckets);
+
+- closure_bio_submit(bio, &ca->prio);
++ closure_bio_submit(ca->set, bio, &ca->prio);
+ closure_sync(cl);
+ }
+
+@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
+ test_bit(CACHE_SET_STOPPING, &c->flags))
+ return false;
+
++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
++ pr_warn("CACHE_SET_IO_DISABLE already set");
++
+ /* XXX: we can be called from atomic context
+ acquire_console_sem();
+ */
+@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
+ c->congested_read_threshold_us = 2000;
+ c->congested_write_threshold_us = 20000;
+ c->error_limit = DEFAULT_IO_ERROR_LIMIT;
++ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+ return c;
+ err:
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index 399e91cbf714..cf973c07c856 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -95,6 +95,7 @@ read_attribute(partial_stripes_expensive);
+
+ rw_attribute(synchronous);
+ rw_attribute(journal_delay_ms);
++rw_attribute(io_disable);
+ rw_attribute(discard);
+ rw_attribute(running);
+ rw_attribute(label);
+@@ -588,6 +589,8 @@ SHOW(__bch_cache_set)
+ sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
+ sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
++ sysfs_printf(io_disable, "%i",
++ test_bit(CACHE_SET_IO_DISABLE, &c->flags));
+
+ if (attr == &sysfs_bset_tree_stats)
+ return bch_bset_print_stats(c, buf);
+@@ -677,6 +680,22 @@ STORE(__bch_cache_set)
+ if (attr == &sysfs_io_error_halflife)
+ c->error_decay = strtoul_or_return(buf) / 88;
+
++ if (attr == &sysfs_io_disable) {
++ int v = strtoul_or_return(buf);
++
++ if (v) {
++ if (test_and_set_bit(CACHE_SET_IO_DISABLE,
++ &c->flags))
++ pr_warn("CACHE_SET_IO_DISABLE"
++ " already set");
++ } else {
++ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE,
++ &c->flags))
++ pr_warn("CACHE_SET_IO_DISABLE"
++ " already cleared");
++ }
++ }
++
+ sysfs_strtoul(journal_delay_ms, c->journal_delay_ms);
+ sysfs_strtoul(verify, c->verify);
+ sysfs_strtoul(key_merging_disabled, c->key_merging_disabled);
+@@ -762,6 +781,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
+ &sysfs_gc_always_rewrite,
+ &sysfs_btree_shrinker_disabled,
+ &sysfs_copy_gc_enabled,
++ &sysfs_io_disable,
+ NULL
+ };
+ KTYPE(bch_cache_set_internal);
+diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
+index a6763db7f061..268024529edd 100644
+--- a/drivers/md/bcache/util.h
++++ b/drivers/md/bcache/util.h
+@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev)
+ return bdev->bd_inode->i_size >> 9;
+ }
+
+-#define closure_bio_submit(bio, cl) \
+-do { \
+- closure_get(cl); \
+- generic_make_request(bio); \
+-} while (0)
+-
+ uint64_t bch_crc64_update(uint64_t, const void *, size_t);
+ uint64_t bch_crc64(const void *, size_t);
+
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index 8f98ef1038d3..3d7d8452e0de 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work)
+ struct cached_dev *dc = container_of(to_delayed_work(work),
+ struct cached_dev,
+ writeback_rate_update);
++ struct cache_set *c = dc->disk.c;
+
+ /*
+ * should check BCACHE_DEV_RATE_DW_RUNNING before calling
+@@ -123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work)
+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+ smp_mb();
+
+- if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) {
++ /*
++ * CACHE_SET_IO_DISABLE might be set via sysfs interface,
++ * check it here too.
++ */
++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+ smp_mb();
+@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work)
+
+ up_read(&dc->writeback_lock);
+
+- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) {
++ /*
++ * CACHE_SET_IO_DISABLE might be set via sysfs interface,
++ * check it here too.
++ */
++ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+ schedule_delayed_work(&dc->writeback_rate_update,
+ dc->writeback_rate_update_seconds * HZ);
+ }
+@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl)
+ bio_set_dev(&io->bio, io->dc->bdev);
+ io->bio.bi_end_io = dirty_endio;
+
+- closure_bio_submit(&io->bio, cl);
++ closure_bio_submit(io->dc->disk.c, &io->bio, cl);
+ }
+
+ atomic_set(&dc->writeback_sequence_next, next_sequence);
+@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl)
+ {
+ struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+- closure_bio_submit(&io->bio, cl);
++ closure_bio_submit(io->dc->disk.c, &io->bio, cl);
+
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+ }
+@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc)
+
+ next = bch_keybuf_next(&dc->writeback_keys);
+
+- while (!kthread_should_stop() && next) {
++ while (!kthread_should_stop() &&
++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
++ next) {
+ size = 0;
+ nk = 0;
+
+@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc)
+ }
+ }
+
+- while (!kthread_should_stop() && delay) {
++ while (!kthread_should_stop() &&
++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
++ delay) {
+ schedule_timeout_interruptible(delay);
+ delay = writeback_delay(dc, 0);
+ }
+@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc)
+ static int bch_writeback_thread(void *arg)
+ {
+ struct cached_dev *dc = arg;
++ struct cache_set *c = dc->disk.c;
+ bool searched_full_index;
+
+ bch_ratelimit_reset(&dc->writeback_rate);
+
+- while (!kthread_should_stop()) {
++ while (!kthread_should_stop() &&
++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+ down_write(&dc->writeback_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg)
+ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
+ up_write(&dc->writeback_lock);
+
+- if (kthread_should_stop()) {
++ if (kthread_should_stop() ||
++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+ set_current_state(TASK_RUNNING);
+ break;
+ }
+@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg)
+
+ while (delay &&
+ !kthread_should_stop() &&
++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
+ !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+ delay = schedule_timeout_interruptible(delay);
+
+--
+2.16.1
+
diff --git a/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch
new file mode 100644
index 0000000..7cea41c
--- /dev/null
+++ b/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch
@@ -0,0 +1,67 @@
+From e741114d13958a91559cd8c376cf704cb2180370 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 10 Jan 2018 00:26:32 +0800
+Subject: [PATCH v5 06/11] bcache: stop all attached bcache devices for a
+ retired cache set
+
+When there are too many I/O errors on cache device, current bcache code
+will retire the whole cache set, and detach all bcache devices. But the
+detached bcache devices are not stopped, which is problematic when bcache
+is in writeback mode.
+
+If the retired cache set has dirty data of backing devices, continue
+writing to bcache device will write to backing device directly. If the
+LBA of write request has a dirty version cached on cache device, next time
+when the cache device is re-registered and backing device re-attached to
+it again, the stale dirty data on cache device will be written to backing
+device, and overwrite latest directly written data. This situation causes
+serious data corruption.
+
+This patch checks whether cache_set->io_disable is true in
+__cache_set_unregister(). If cache_set->io_disable is true, it means cache
+set is unregistering by too many I/O errors, then all attached bcache
+devices will be stopped as well. If cache_set->io_disable is not true, it
+means __cache_set_unregister() is triggered by writing 1 to sysfs file
+/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do
+it explicitly, this patch keeps existing behavior and does not stop any
+bcache device.
+
+Even the failed cache device has no dirty data, stopping bcache device is
+still a desired behavior by many Ceph and data base users. Then their
+application will report I/O errors due to disappeared bcache device, and
+operation people will know the cache device is broken or disconnected.
+
+Changelog:
+v2: add reviewed-by from Hannes.
+v1: initial version for review.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+Cc: Michael Lyle <mlyle@lyle.org>
+---
+ drivers/md/bcache/super.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index f8b0d1196c12..41ef438e7b40 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1478,6 +1478,14 @@ static void __cache_set_unregister(struct closure *cl)
+ dc = container_of(c->devices[i],
+ struct cached_dev, disk);
+ bch_cached_dev_detach(dc);
++ /*
++ * If we come here by too many I/O errors,
++ * bcache device should be stopped too, to
++ * keep data consistency on cache and
++ * backing devices.
++ */
++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
++ bcache_device_stop(c->devices[i]);
+ } else {
+ bcache_device_stop(c->devices[i]);
+ }
+--
+2.16.1
+
diff --git a/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch
new file mode 100644
index 0000000..419c2c7
--- /dev/null
+++ b/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch
@@ -0,0 +1,119 @@
+From d6f9f789096d2c3473314fa10ea1166683399ac8 Mon Sep 17 00:00:00 2001
+From: Tang Junhui <tang.junhui@zte.com.cn>
+Date: Tue, 9 Jan 2018 10:27:11 +0800
+Subject: [PATCH v5 07/11] bcache: fix inaccurate io state for detached bcache
+ devices
+
+When we run IO on a detached device and run iostat to show the IO status,
+normally it will show something like below (some fields omitted):
+Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util
+sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30
+bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60
+but after the IO has stopped, there are still very big avgqu-sz and %util
+values, as below:
+Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util
+bcache0 ... 0 5326.32 0.00 0.00 0.00 0.00 100.10
+
+The reason for this issue is that, only generic_start_io_acct() called
+and no generic_end_io_acct() called for detached device in
+cached_dev_make_request(). See the code:
+//start generic_start_io_acct()
+generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+if (cached_dev_get(dc)) {
+ //will callback generic_end_io_acct()
+}
+else {
+ //will not call generic_end_io_acct()
+}
+
+This patch calls generic_end_io_acct() in the end of IO for detached
+devices, so we can show IO state correctly.
+
+(Modified to use GFP_NOIO in kzalloc() by Coly Li)
+
+Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
+Reviewed-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+---
+ drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 51 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index 02296bda6384..e09c5ae745be 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl)
+ continue_at(cl, cached_dev_bio_complete, NULL);
+ }
+
++struct detached_dev_io_private {
++ struct bcache_device *d;
++ unsigned long start_time;
++ bio_end_io_t *bi_end_io;
++ void *bi_private;
++};
++
++static void detatched_dev_end_io(struct bio *bio)
++{
++ struct detached_dev_io_private *ddip;
++
++ ddip = bio->bi_private;
++ bio->bi_end_io = ddip->bi_end_io;
++ bio->bi_private = ddip->bi_private;
++
++ generic_end_io_acct(ddip->d->disk->queue,
++ bio_data_dir(bio),
++ &ddip->d->disk->part0, ddip->start_time);
++
++ kfree(ddip);
++
++ bio->bi_end_io(bio);
++}
++
++static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
++{
++ struct detached_dev_io_private *ddip;
++ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
++
++ /*
++ * no need to call closure_get(&dc->disk.cl),
++ * because upper layer had already opened bcache device,
++ * which would call closure_get(&dc->disk.cl)
++ */
++ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
++ ddip->d = d;
++ ddip->start_time = jiffies;
++ ddip->bi_end_io = bio->bi_end_io;
++ ddip->bi_private = bio->bi_private;
++ bio->bi_end_io = detatched_dev_end_io;
++ bio->bi_private = ddip;
++
++ if ((bio_op(bio) == REQ_OP_DISCARD) &&
++ !blk_queue_discard(bdev_get_queue(dc->bdev)))
++ bio->bi_end_io(bio);
++ else
++ generic_make_request(bio);
++}
++
+ /* Cached devices - read & write stuff */
+
+ static blk_qc_t cached_dev_make_request(struct request_queue *q,
+@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ else
+ cached_dev_read(dc, s);
+ }
+- } else {
+- if ((bio_op(bio) == REQ_OP_DISCARD) &&
+- !blk_queue_discard(bdev_get_queue(dc->bdev)))
+- bio_endio(bio);
+- else
+- generic_make_request(bio);
+- }
++ } else
++ detached_dev_do_request(d, bio);
+
+ return BLK_QC_T_NONE;
+ }
+--
+2.16.1
+
diff --git a/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch
new file mode 100644
index 0000000..4cddd78
--- /dev/null
+++ b/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch
@@ -0,0 +1,255 @@
+From e76beb2960de33506c0f6e177d43f8a8cfafee30 Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 10 Jan 2018 21:01:48 +0800
+Subject: [PATCH v5 08/11] bcache: add backing_request_endio() for bi_end_io of
+ attached backing device I/O
+
+In order to catch I/O errors on the backing device, a separate bi_end_io
+callback is required. Then a per-backing-device counter can record the I/O
+error count and retire the backing device if the counter reaches a
+per-backing-device I/O error limit.
+
+This patch adds backing_request_endio() to bcache backing device I/O code
+path, this is a preparation for further complicated backing device failure
+handling. So far there is no real code logic change, I make this change a
+separate patch to make sure it is stable and reliable for further work.
+
+Changelog:
+v2: Fix code comments typo, remove a redundant bch_writeback_add() line
+ added in v4 patch set.
+v1: indeed this is new added in this patch set.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+Cc: Michael Lyle <mlyle@lyle.org>
+---
+ drivers/md/bcache/request.c | 93 +++++++++++++++++++++++++++++++++++--------
+ drivers/md/bcache/super.c | 1 +
+ drivers/md/bcache/writeback.c | 1 +
+ 3 files changed, 79 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index e09c5ae745be..9c6dda3b0068 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl)
+ }
+
+ op->insert_data_done = true;
++ /* get in bch_data_insert() */
+ bio_put(bio);
+ out:
+ continue_at(cl, bch_data_insert_keys, op->wq);
+@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio)
+ closure_put(cl);
+ }
+
++static void backing_request_endio(struct bio *bio)
++{
++ struct closure *cl = bio->bi_private;
++
++ if (bio->bi_status) {
++ struct search *s = container_of(cl, struct search, cl);
++ /*
++ * If a bio has REQ_PREFLUSH for writeback mode, it is
++ * speically assembled in cached_dev_write() for a non-zero
++ * write request which has REQ_PREFLUSH. we don't set
++ * s->iop.status by this failure, the status will be decided
++ * by result of bch_data_insert() operation.
++ */
++ if (unlikely(s->iop.writeback &&
++ bio->bi_opf & REQ_PREFLUSH)) {
++ char buf[BDEVNAME_SIZE];
++
++ bio_devname(bio, buf);
++ pr_err("Can't flush %s: returned bi_status %i",
++ buf, bio->bi_status);
++ } else {
++ /* set to orig_bio->bi_status in bio_complete() */
++ s->iop.status = bio->bi_status;
++ }
++ s->recoverable = false;
++ /* should count I/O error for backing device here */
++ }
++
++ bio_put(bio);
++ closure_put(cl);
++}
++
+ static void bio_complete(struct search *s)
+ {
+ if (s->orig_bio) {
+@@ -644,13 +677,21 @@ static void bio_complete(struct search *s)
+ }
+ }
+
+-static void do_bio_hook(struct search *s, struct bio *orig_bio)
++static void do_bio_hook(struct search *s,
++ struct bio *orig_bio,
++ bio_end_io_t *end_io_fn)
+ {
+ struct bio *bio = &s->bio.bio;
+
+ bio_init(bio, NULL, 0);
+ __bio_clone_fast(bio, orig_bio);
+- bio->bi_end_io = request_endio;
++ /*
++ * bi_end_io can be set separately somewhere else, e.g. the
++ * variants in,
++ * - cache_bio->bi_end_io from cached_dev_cache_miss()
++ * - n->bi_end_io from cache_lookup_fn()
++ */
++ bio->bi_end_io = end_io_fn;
+ bio->bi_private = &s->cl;
+
+ bio_cnt_set(bio, 3);
+@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio,
+ s = mempool_alloc(d->c->search, GFP_NOIO);
+
+ closure_init(&s->cl, NULL);
+- do_bio_hook(s, bio);
++ do_bio_hook(s, bio, request_endio);
+
+ s->orig_bio = bio;
+ s->cache_miss = NULL;
+@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl)
+ trace_bcache_read_retry(s->orig_bio);
+
+ s->iop.status = 0;
+- do_bio_hook(s, s->orig_bio);
++ do_bio_hook(s, s->orig_bio, backing_request_endio);
+
+ /* XXX: invalidate cache */
+
++ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, bio, cl);
+ }
+
+@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
+ bio_copy_dev(cache_bio, miss);
+ cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
+
+- cache_bio->bi_end_io = request_endio;
++ cache_bio->bi_end_io = backing_request_endio;
+ cache_bio->bi_private = &s->cl;
+
+ bch_bio_map(cache_bio, NULL);
+@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
+ s->cache_miss = miss;
+ s->iop.bio = cache_bio;
+ bio_get(cache_bio);
++ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, cache_bio, &s->cl);
+
+ return ret;
+ out_put:
+ bio_put(cache_bio);
+ out_submit:
+- miss->bi_end_io = request_endio;
++ miss->bi_end_io = backing_request_endio;
+ miss->bi_private = &s->cl;
++ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, miss, &s->cl);
+ return ret;
+ }
+@@ -943,31 +987,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
+ s->iop.bio = s->orig_bio;
+ bio_get(s->iop.bio);
+
+- if ((bio_op(bio) != REQ_OP_DISCARD) ||
+- blk_queue_discard(bdev_get_queue(dc->bdev)))
+- closure_bio_submit(s->iop.c, bio, cl);
++ if (bio_op(bio) == REQ_OP_DISCARD &&
++ !blk_queue_discard(bdev_get_queue(dc->bdev)))
++ goto insert_data;
++
++ /* I/O request sent to backing device */
++ bio->bi_end_io = backing_request_endio;
++ closure_bio_submit(s->iop.c, bio, cl);
++
+ } else if (s->iop.writeback) {
+ bch_writeback_add(dc);
+ s->iop.bio = bio;
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+- /* Also need to send a flush to the backing device */
+- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
+- dc->disk.bio_split);
+-
++ /*
++ * Also need to send a flush to the backing
++ * device.
++ */
++ struct bio *flush;
++
++ flush = bio_alloc_bioset(GFP_NOIO, 0,
++ dc->disk.bio_split);
++ if (!flush) {
++ s->iop.status = BLK_STS_RESOURCE;
++ goto insert_data;
++ }
+ bio_copy_dev(flush, bio);
+- flush->bi_end_io = request_endio;
++ flush->bi_end_io = backing_request_endio;
+ flush->bi_private = cl;
+ flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+-
++ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, flush, cl);
+ }
+ } else {
+ s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+-
++ /* I/O request sent to backing device */
++ bio->bi_end_io = backing_request_endio;
+ closure_bio_submit(s->iop.c, bio, cl);
+ }
+
++insert_data:
+ closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
+ continue_at(cl, cached_dev_write_complete, NULL);
+ }
+@@ -981,6 +1040,7 @@ static void cached_dev_nodata(struct closure *cl)
+ bch_journal_meta(s->iop.c, cl);
+
+ /* If it's a flush, we send the flush to the backing device too */
++ bio->bi_end_io = backing_request_endio;
+ closure_bio_submit(s->iop.c, bio, cl);
+
+ continue_at(cl, cached_dev_bio_complete, NULL);
+@@ -1078,6 +1138,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ cached_dev_read(dc, s);
+ }
+ } else
++ /* I/O request sent to backing device */
+ detached_dev_do_request(d, bio);
+
+ return BLK_QC_T_NONE;
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 41ef438e7b40..082faaf2ee2f 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
+ bio->bi_private = dc;
+
+ closure_get(cl);
++ /* I/O request sent to backing device */
+ __write_super(&dc->sb, bio);
+
+ closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
+diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
+index 3d7d8452e0de..4ebe0119ea7e 100644
+--- a/drivers/md/bcache/writeback.c
++++ b/drivers/md/bcache/writeback.c
+@@ -289,6 +289,7 @@ static void write_dirty(struct closure *cl)
+ bio_set_dev(&io->bio, io->dc->bdev);
+ io->bio.bi_end_io = dirty_endio;
+
++ /* I/O request sent to backing device */
+ closure_bio_submit(io->dc->disk.c, &io->bio, cl);
+ }
+
+--
+2.16.1
+
diff --git a/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch
new file mode 100644
index 0000000..9a63f8d
--- /dev/null
+++ b/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch
@@ -0,0 +1,237 @@
+From fff4f5d3d40952ddba1649fd7a5ce0d025b0b3cc Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Wed, 10 Jan 2018 21:33:45 +0800
+Subject: [PATCH v5 09/11] bcache: add io_disable to struct cached_dev
+
+If a bcache device is configured to writeback mode, current code does not
+handle write I/O errors on backing devices properly.
+
+In writeback mode, a write request is written to the cache device, and
+later flushed to the backing device. If an I/O fails when writing from
+the cache device to the backing device, bcache code just ignores the error
+and the upper layer code is NOT notified that the backing device is broken.
+
+This patch tries to handle backing device failure like how the cache device
+failure is handled,
+- Add a error counter 'io_errors' and error limit 'error_limit' in struct
+ cached_dev. Add another io_disable to struct cached_dev to disable I/Os
+ on the problematic backing device.
+- When I/O error happens on backing device, increase io_errors counter. And
+ if io_errors reaches error_limit, set cache_dev->io_disable to true, and
+ stop the bcache device.
+
+The result is, if the backing device is broken or disconnected, and I/O
+errors reach its error limit, the backing device will be disabled and the
+associated bcache device will be removed from the system.
+
+Changelog:
+v2: remove "bcache: " prefix in pr_error(), and use correct name string to
+ print out bcache device gendisk name.
+v1: indeed this is new added in v2 patch set.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+---
+ drivers/md/bcache/bcache.h | 7 +++++++
+ drivers/md/bcache/io.c | 14 ++++++++++++++
+ drivers/md/bcache/request.c | 14 ++++++++++++--
+ drivers/md/bcache/super.c | 22 ++++++++++++++++++++++
+ drivers/md/bcache/sysfs.c | 15 ++++++++++++++-
+ 5 files changed, 69 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 7917b3820dd5..822ec75bb78c 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -360,6 +360,7 @@ struct cached_dev {
+ unsigned sequential_cutoff;
+ unsigned readahead;
+
++ unsigned io_disable:1;
+ unsigned verify:1;
+ unsigned bypass_torture_test:1;
+
+@@ -379,6 +380,10 @@ struct cached_dev {
+ unsigned writeback_rate_i_term_inverse;
+ unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_minimum;
++
++#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64
++ atomic_t io_errors;
++ unsigned error_limit;
+ };
+
+ enum alloc_reserve {
+@@ -888,6 +893,7 @@ static inline void closure_bio_submit(struct cache_set *c,
+
+ /* Forward declarations */
+
++void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio);
+ void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
+ void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
+ blk_status_t, const char *);
+@@ -915,6 +921,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned,
+ struct bkey *, int, bool);
+ bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
+ unsigned, unsigned, bool);
++bool bch_cached_dev_error(struct cached_dev *dc);
+
+ __printf(2, 3)
+ bool bch_cache_set_error(struct cache_set *, const char *, ...);
+diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
+index 8013ecbcdbda..7fac97ae036e 100644
+--- a/drivers/md/bcache/io.c
++++ b/drivers/md/bcache/io.c
+@@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
+ }
+
+ /* IO errors */
++void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
++{
++ char buf[BDEVNAME_SIZE];
++ unsigned errors;
++
++ WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
++
++ errors = atomic_add_return(1, &dc->io_errors);
++ if (errors < dc->error_limit)
++ pr_err("%s: IO error on backing device, unrecoverable",
++ bio_devname(bio, buf));
++ else
++ bch_cached_dev_error(dc);
++}
+
+ void bch_count_io_errors(struct cache *ca,
+ blk_status_t error,
+diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
+index 9c6dda3b0068..03245e6980a6 100644
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio)
+
+ if (bio->bi_status) {
+ struct search *s = container_of(cl, struct search, cl);
++ struct cached_dev *dc = container_of(s->d,
++ struct cached_dev, disk);
+ /*
+ * If a bio has REQ_PREFLUSH for writeback mode, it is
+ * speically assembled in cached_dev_write() for a non-zero
+@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio)
+ }
+ s->recoverable = false;
+ /* should count I/O error for backing device here */
++ bch_count_backing_io_errors(dc, bio);
+ }
+
+ bio_put(bio);
+@@ -1065,8 +1068,14 @@ static void detatched_dev_end_io(struct bio *bio)
+ bio_data_dir(bio),
+ &ddip->d->disk->part0, ddip->start_time);
+
+- kfree(ddip);
++ if (bio->bi_status) {
++ struct cached_dev *dc = container_of(ddip->d,
++ struct cached_dev, disk);
++ /* should count I/O error for backing device here */
++ bch_count_backing_io_errors(dc, bio);
++ }
+
++ kfree(ddip);
+ bio->bi_end_io(bio);
+ }
+
+@@ -1105,7 +1114,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
+ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+ int rw = bio_data_dir(bio);
+
+- if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
++ if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
++ dc->io_disable)) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 082faaf2ee2f..91a08cdd55bd 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+ max(dc->disk.disk->queue->backing_dev_info->ra_pages,
+ q->backing_dev_info->ra_pages);
+
++ atomic_set(&dc->io_errors, 0);
++ dc->io_disable = false;
++ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
++
+ bch_cached_dev_request_init(dc);
+ bch_cached_dev_writeback_init(dc);
+ return 0;
+@@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
+ return flash_dev_run(c, u);
+ }
+
++bool bch_cached_dev_error(struct cached_dev *dc)
++{
++ char name[BDEVNAME_SIZE];
++
++ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
++ return false;
++
++ dc->io_disable = true;
++ /* make others know io_disable is true earlier */
++ smp_mb();
++
++ pr_err("stop %s: too many IO errors on backing device %s\n",
++ dc->disk.disk->disk_name, bdevname(dc->bdev, name));
++
++ bcache_device_stop(&dc->disk);
++ return true;
++}
++
+ /* Cache set */
+
+ __printf(2, 3)
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index cf973c07c856..ac3adf2dcf6c 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -134,7 +134,9 @@ SHOW(__bch_cached_dev)
+ var_print(writeback_delay);
+ var_print(writeback_percent);
+ sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
+-
++ sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
++ sysfs_printf(io_error_limit, "%i", dc->error_limit);
++ sysfs_printf(io_disable, "%i", dc->io_disable);
+ var_print(writeback_rate_update_seconds);
+ var_print(writeback_rate_i_term_inverse);
+ var_print(writeback_rate_p_term_inverse);
+@@ -225,6 +227,14 @@ STORE(__cached_dev)
+ d_strtoul(writeback_rate_i_term_inverse);
+ d_strtoul_nonzero(writeback_rate_p_term_inverse);
+
++ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
++
++ if (attr == &sysfs_io_disable) {
++ int v = strtoul_or_return(buf);
++
++ dc->io_disable = v ? 1 : 0;
++ }
++
+ d_strtoi_h(sequential_cutoff);
+ d_strtoi_h(readahead);
+
+@@ -332,6 +342,9 @@ static struct attribute *bch_cached_dev_files[] = {
+ &sysfs_writeback_rate_i_term_inverse,
+ &sysfs_writeback_rate_p_term_inverse,
+ &sysfs_writeback_rate_debug,
++ &sysfs_errors,
++ &sysfs_io_error_limit,
++ &sysfs_io_disable,
+ &sysfs_dirty_data,
+ &sysfs_stripe_size,
+ &sysfs_partial_stripes_expensive,
+--
+2.16.1
+
diff --git a/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch
new file mode 100644
index 0000000..d03b24e
--- /dev/null
+++ b/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch
@@ -0,0 +1,152 @@
+From 7d8d5a020e69671723932da30a1800eed91d3bcd Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 13 Jan 2018 17:31:44 +0800
+Subject: [PATCH v5 10/11] bcache: stop bcache device when backing device is
+ offline
+
+Currently bcache does not handle backing device failure: if the backing
+device is offline and disconnected from the system, its bcache device can
+still be accessed. If the bcache device is in writeback mode, I/O requests
+can even succeed if they hit the cache device. That is to say, when and
+how bcache handles an offline backing device is undefined.
+
+This patch tries to handle backing device offline in a rather simple way,
+- Add cached_dev->status_update_thread kernel thread to update backing
+ device status in every 1 second.
+- Add cached_dev->offline_seconds to record how many seconds the backing
+  device is observed to be offline. If the backing device is offline for
+  BACKING_DEV_OFFLINE_TIMEOUT (5) seconds, set dc->io_disable to 1 and
+  call bcache_device_stop() to stop the bcache device which is linked to
+  the offline backing device.
+
+Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
+its bcache device will be removed, then a user space application writing to
+it will get an error immediately, and can handle the device failure in time.
+
+This patch is quite simple and does not handle more complicated situations.
+Once the bcache device is stopped, users need to recover the backing
+device, then register and attach it manually.
+
+Changelog:
+v2: remove "bcache: " prefix when calling pr_warn().
+v1: initial version.
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Reviewed-by: Hannes Reinecke <hare@suse.com>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+---
+ drivers/md/bcache/bcache.h | 2 ++
+ drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 57 insertions(+)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 822ec75bb78c..1c749352172d 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -338,6 +338,7 @@ struct cached_dev {
+
+ struct keybuf writeback_keys;
+
++ struct task_struct *status_update_thread;
+ /*
+ * Order the write-half of writeback operations strongly in dispatch
+ * order. (Maintain LBA order; don't allow reads completing out of
+@@ -384,6 +385,7 @@ struct cached_dev {
+ #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64
+ atomic_t io_errors;
+ unsigned error_limit;
++ unsigned offline_seconds;
+ };
+
+ enum alloc_reserve {
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index 91a08cdd55bd..de0f5fb9bde2 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+ {
+ struct bcache_device *d = b->bd_disk->private_data;
++ struct cached_dev *dc = container_of(d, struct cached_dev, disk);
++
++ if (dc->io_disable)
++ return -EIO;
++
+ return d->ioctl(d, mode, cmd, arg);
+ }
+
+@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c)
+ c->cached_dev_sectors = sectors;
+ }
+
++#define BACKING_DEV_OFFLINE_TIMEOUT 5
++static int cached_dev_status_update(void *arg)
++{
++ struct cached_dev *dc = arg;
++ struct request_queue *q;
++ char buf[BDEVNAME_SIZE];
++
++ /*
++ * If this delayed worker is stopping outside, directly quit here.
++ * dc->io_disable might be set via sysfs interface, so check it
++ * here too.
++ */
++ while (!kthread_should_stop() && !dc->io_disable) {
++ q = bdev_get_queue(dc->bdev);
++ if (blk_queue_dying(q))
++ dc->offline_seconds++;
++ else
++ dc->offline_seconds = 0;
++
++ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
++ pr_err("%s: device offline for %d seconds",
++ bdevname(dc->bdev, buf),
++ BACKING_DEV_OFFLINE_TIMEOUT);
++ pr_err("%s: disable I/O request due to backing "
++ "device offline", dc->disk.name);
++ dc->io_disable = true;
++ /* let others know earlier that io_disable is true */
++ smp_mb();
++ bcache_device_stop(&dc->disk);
++ break;
++ }
++
++ schedule_timeout_interruptible(HZ);
++ }
++
++ dc->status_update_thread = NULL;
++ return 0;
++}
++
+ void bch_cached_dev_run(struct cached_dev *dc)
+ {
+ struct bcache_device *d = &dc->disk;
+@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc)
+ if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
+ sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
+ pr_debug("error creating sysfs link");
++
++ dc->status_update_thread = kthread_run(cached_dev_status_update,
++ dc,
++ "bcache_status_update");
++ if (IS_ERR(dc->status_update_thread)) {
++ pr_warn("failed to create bcache_status_update kthread, "
++ "continue to run without monitoring backing "
++ "device status");
++ }
+ }
+
+ /*
+@@ -1118,6 +1171,8 @@ static void cached_dev_free(struct closure *cl)
+ kthread_stop(dc->writeback_thread);
+ if (dc->writeback_write_wq)
+ destroy_workqueue(dc->writeback_write_wq);
++ if (!IS_ERR_OR_NULL(dc->status_update_thread))
++ kthread_stop(dc->status_update_thread);
+
+ if (atomic_read(&dc->running))
+ bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
+--
+2.16.1
+
diff --git a/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch b/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch
new file mode 100644
index 0000000..c782476
--- /dev/null
+++ b/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch
@@ -0,0 +1,251 @@
+From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Mon, 5 Feb 2018 10:43:18 +0800
+Subject: [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to
+ backing device
+
+Current bcache failure handling code will stop all attached bcache devices
+when the cache set is broken or disconnected. This might not be desired
+behavior — for example, bcache deployed for an email service. In such a
+workload, if the cache device is broken but no dirty data is lost, keeping
+the bcache device alive and permitting the email service to continue to
+access data might be a better solution for the cache device failure.
+
+Nix <nix@esperi.org.uk> points out the issue and provides the above example
+to explain why it might be necessary to not stop bcache device for broken
+cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
+suggestion to provide "always" and "auto" options to per-cached device
+sysfs file stop_when_cache_set_failed. If cache set is retiring and the
+backing device has no dirty data on cache, it should be safe to keep the
+bcache device alive. In this case, if stop_when_cache_set_failed is set to
+"auto", the device failure handling code will not stop this bcache device
+and will permit the application to access the backing device with an
+unattached bcache device.
+
+Changelog:
+v2: change option values of stop_when_cache_set_failed from 1/0 to
+ "auto"/"always".
+v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
+ (always stop).
+
+Signed-off-by: Coly Li <colyli@suse.de>
+Cc: Nix <nix@esperi.org.uk>
+Cc: Pavel Goran <via-bcache@pvgoran.name>
+Cc: Michael Lyle <mlyle@lyle.org>
+Cc: Junhui Tang <tang.junhui@zte.com.cn>
+Cc: Hannes Reinecke <hare@suse.com>
+---
+ drivers/md/bcache/bcache.h | 8 +++++
+ drivers/md/bcache/super.c | 90 ++++++++++++++++++++++++++++++++++++----------
+ drivers/md/bcache/sysfs.c | 17 +++++++++
+ 3 files changed, 97 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
+index 1c749352172d..59e675304b7e 100644
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -287,6 +287,12 @@ struct io {
+ sector_t last;
+ };
+
++enum stop_on_faliure {
++ BCH_CACHED_DEV_STOP_ATUO = 0,
++ BCH_CACHED_DEV_STOP_ALWAYS,
++ BCH_CACHED_DEV_STOP_MODE_MAX,
++};
++
+ struct cached_dev {
+ struct list_head list;
+ struct bcache_device disk;
+@@ -382,6 +388,7 @@ struct cached_dev {
+ unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_minimum;
+
++ enum stop_on_faliure stop_when_cache_set_failed;
+ #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64
+ atomic_t io_errors;
+ unsigned error_limit;
+@@ -933,6 +940,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *);
+
+ extern struct workqueue_struct *bcache_wq;
+ extern const char * const bch_cache_modes[];
++extern const char * const bch_stop_on_failure_modes[];
+ extern struct mutex bch_register_lock;
+ extern struct list_head bch_cache_sets;
+
+diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
+index de0f5fb9bde2..d2999f1e6ae2 100644
+--- a/drivers/md/bcache/super.c
++++ b/drivers/md/bcache/super.c
+@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = {
+ NULL
+ };
+
++/* Default is -1; we skip past it for stop_when_cache_set_failed */
++const char * const bch_stop_on_failure_modes[] = {
++ "default",
++ "auto",
++ "always",
++ NULL
++};
++
+ static struct kobject *bcache_kobj;
+ struct mutex bch_register_lock;
+ LIST_HEAD(bch_cache_sets);
+@@ -1246,6 +1254,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
+ atomic_set(&dc->io_errors, 0);
+ dc->io_disable = false;
+ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
++ /* default to auto */
++ dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_ATUO;
+
+ bch_cached_dev_request_init(dc);
+ bch_cached_dev_writeback_init(dc);
+@@ -1541,33 +1551,77 @@ static void cache_set_flush(struct closure *cl)
+ closure_return(cl);
+ }
+
++/*
++ * This function is only called when CACHE_SET_IO_DISABLE is set, which means
++ * cache set is unregistering due to too many I/O errors. In this condition,
++ * the bcache device might be stopped, it depends on stop_when_cache_set_failed
++ * value and whether the broken cache has dirty data:
++ *
++ * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device
++ * BCH_CACHED_STOP_ATUO 0 NO
++ * BCH_CACHED_STOP_ATUO 1 YES
++ * BCH_CACHED_DEV_STOP_ALWAYS 0 YES
++ * BCH_CACHED_DEV_STOP_ALWAYS 1 YES
++ *
++ * The expected behavior is, if stop_when_cache_set_failed is configured to
++ * "auto" via sysfs interface, the bcache device will not be stopped if the
++ * backing device is clean on the broken cache device.
++ */
++static void conditional_stop_bcache_device(struct cache_set *c,
++ struct bcache_device *d,
++ struct cached_dev *dc)
++{
++ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
++ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop"
++ " it for failed cache set %pU.",
++ d->disk->disk_name, c->sb.set_uuid);
++ bcache_device_stop(d);
++ return;
++ } else if (atomic_read(&dc->has_dirty)) {
++ /*
++ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO
++ * and dc->has_dirty == 1
++ */
++ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and "
++ "cache is dirty, stop it to avoid potential data "
++ "corruption.",
++ d->disk->disk_name);
++ bcache_device_stop(d);
++ } else {
++ /*
++ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO
++ * and dc->has_dirty == 0
++ */
++ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and "
++ "cache is clean, keep it alive.",
++ d->disk->disk_name);
++ }
++}
++
+ static void __cache_set_unregister(struct closure *cl)
+ {
+ struct cache_set *c = container_of(cl, struct cache_set, caching);
+ struct cached_dev *dc;
++ struct bcache_device *d;
+ size_t i;
+
+ mutex_lock(&bch_register_lock);
+
+- for (i = 0; i < c->devices_max_used; i++)
+- if (c->devices[i]) {
+- if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+- dc = container_of(c->devices[i],
+- struct cached_dev, disk);
+- bch_cached_dev_detach(dc);
+- /*
+- * If we come here by too many I/O errors,
+- * bcache device should be stopped too, to
+- * keep data consistency on cache and
+- * backing devices.
+- */
+- if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+- bcache_device_stop(c->devices[i]);
+- } else {
+- bcache_device_stop(c->devices[i]);
+- }
++ for (i = 0; i < c->devices_max_used; i++) {
++ d = c->devices[i];
++ if (!d)
++ continue;
++
++ if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
++ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
++ dc = container_of(d, struct cached_dev, disk);
++ bch_cached_dev_detach(dc);
++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
++ conditional_stop_bcache_device(c, d, dc);
++ } else {
++ bcache_device_stop(d);
+ }
++ }
+
+ mutex_unlock(&bch_register_lock);
+
+diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
+index ac3adf2dcf6c..e88fdcc549cd 100644
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us);
+ rw_attribute(sequential_cutoff);
+ rw_attribute(data_csum);
+ rw_attribute(cache_mode);
++rw_attribute(stop_when_cache_set_failed);
+ rw_attribute(writeback_metadata);
+ rw_attribute(writeback_running);
+ rw_attribute(writeback_percent);
+@@ -126,6 +127,12 @@ SHOW(__bch_cached_dev)
+ bch_cache_modes + 1,
+ BDEV_CACHE_MODE(&dc->sb));
+
++ if (attr == &sysfs_stop_when_cache_set_failed)
++ return bch_snprint_string_list(buf, PAGE_SIZE,
++ bch_stop_on_failure_modes + 1,
++ dc->stop_when_cache_set_failed);
++
++
+ sysfs_printf(data_csum, "%i", dc->disk.data_csum);
+ var_printf(verify, "%i");
+ var_printf(bypass_torture_test, "%i");
+@@ -257,6 +264,15 @@ STORE(__cached_dev)
+ }
+ }
+
++ if (attr == &sysfs_stop_when_cache_set_failed) {
++ v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
++
++ if (v < 0)
++ return v;
++
++ dc->stop_when_cache_set_failed = v;
++ }
++
+ if (attr == &sysfs_label) {
+ if (size > SB_LABEL_SIZE)
+ return -EINVAL;
+@@ -333,6 +349,7 @@ static struct attribute *bch_cached_dev_files[] = {
+ &sysfs_data_csum,
+ #endif
+ &sysfs_cache_mode,
++ &sysfs_stop_when_cache_set_failed,
+ &sysfs_writeback_metadata,
+ &sysfs_writeback_running,
+ &sysfs_writeback_delay,
+--
+2.16.1
+