diff options
author | Coly Li <colyli@suse.de> | 2018-02-05 23:15:25 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2018-02-05 23:15:25 +0800 |
commit | dcc1da5ab0d2ea18f2f5e5dd769047fcc9c0bb43 (patch) | |
tree | bc72e87b45912fac9995a624c0f5f12e7715fd6c | |
parent | 2bbb32018a3cbf7780cf14ba16b9935837b2ce5f (diff) | |
download | bcache-patches-dcc1da5ab0d2ea18f2f5e5dd769047fcc9c0bb43.tar.gz |
for-next: update v5 patch set
26 files changed, 2319 insertions, 0 deletions
diff --git a/for-next/v4-0000-cover-letter.patch b/for-next/v4/v4-0000-cover-letter.patch index 0327afe..0327afe 100644 --- a/for-next/v4-0000-cover-letter.patch +++ b/for-next/v4/v4-0000-cover-letter.patch diff --git a/for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch index 51edd0b..51edd0b 100644 --- a/for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch +++ b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch diff --git a/for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch index 113dd97..113dd97 100644 --- a/for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch +++ b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch diff --git a/for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch index f85123b..f85123b 100644 --- a/for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch diff --git a/for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch index 349a3d1..349a3d1 100644 --- a/for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch +++ b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch diff --git a/for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch index 2e6ce9b..2e6ce9b 100644 --- a/for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch +++ b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch diff --git 
a/for-next/v4-0006-bcache-set-error_limit-correctly.patch b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch index 927468d..927468d 100644 --- a/for-next/v4-0006-bcache-set-error_limit-correctly.patch +++ b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch diff --git a/for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch index 849d522..849d522 100644 --- a/for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch diff --git a/for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch index eab5e76..eab5e76 100644 --- a/for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch diff --git a/for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch index 048a30a..048a30a 100644 --- a/for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch diff --git a/for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch index 80f6dc8..80f6dc8 100644 --- a/for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch diff --git a/for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch index 6b4ae2a..6b4ae2a 100644 --- a/for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch +++ 
b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch diff --git a/for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch index e73bf4f..e73bf4f 100644 --- a/for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch +++ b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch diff --git a/for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch index d9edf10..d9edf10 100644 --- a/for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch +++ b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch diff --git a/for-next/v5-0000-cover-letter.patch b/for-next/v5-0000-cover-letter.patch new file mode 100644 index 0000000..de57fc5 --- /dev/null +++ b/for-next/v5-0000-cover-letter.patch @@ -0,0 +1,92 @@ +From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 5 Feb 2018 18:26:45 +0800 +Subject: [PATCH v5 00/11] bcache: device failure handling improvement + +Hi maintainers and folks, + +This patch set tries to improve bcache device failure handling, includes +cache device and backing device failures. + +The basic idea to handle failed cache device is, +- Unregister cache set +- Detach all backing devices which are attached to this cache set +- Stop all the detached bcache devices (configurable) +- Stop all flash only volume on the cache set +The above process is named 'cache set retire' by me. The result of cache +set retire is, cache set and bcache devices are all removed, following +I/O requests will get failed immediately to notift upper layer or user +space coce that the cache device is failed or disconnected. 
+- Stop all the detached bcache devices (configurable) +- Stop all flash only volume on the cache set +The above process is named 'cache set retire' by me. The result of cache +set retire is, cache set and bcache devices are all removed +(configurable), following I/O requests will get failed immediately to +notify upper layer or user space code that the cache device is failed or +disconnected. + +There are 2 patches from v4 patch set merged into bcache-for-next, they +are not in v5 patch set any more. + +In v5 patch set I add a new patch "bcache: add stop_when_cache_set_failed +option to backing device", which provides "auto"/"always" options to +configure whether or not to stop bcache device for a broken cache set. + +Most of the patches are reviewed by Hannes Reinecke and Junhui Tang. There +are still several patches that need to be reviewed, +- [PATCH v5 03/11] bcache: quit dc->writeback_thread when + BCACHE_DEV_DETACHING is set +- [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to + backing device + +Any comment, question and review are warmly welcome. Thanks in advance. + +Changelog: +v5: add [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to + backing device. + fix issues from v4 patch set. + improve kernel message format, remove redundant prefix string. +v4: add per-cached_dev option stop_attached_devs_on_fail to avoid stopping + attached bcache device from a retiring cache set. +v3: fix detach issue found in v2 patch set. +v2: fixes all problems found in v1 review. + add patches to handle backing device failure. + add one more patch to set writeback_rate_update_seconds range. + include a patch from Junhui Tang. +v1: the initial version, only handles cache device failure.
+ +Coly Li + + +Coly Li (10): + bcache: set writeback_rate_update_seconds in range [1, 60] seconds + bcache: fix cached_dev->count usage for bch_cache_set_error() + bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set + bcache: stop dc->writeback_rate_update properly + bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags + bcache: stop all attached bcache devices for a retired cache set + bcache: add backing_request_endio() for bi_end_io of attached backing + device I/O + bcache: add io_disable to struct cached_dev + bcache: stop bcache device when backing device is offline + bcache: add stop_when_cache_set_failed option to backing device + +Tang Junhui (1): + bcache: fix inaccurate io state for detached bcache devices + + drivers/md/bcache/alloc.c | 3 +- + drivers/md/bcache/bcache.h | 44 ++++++++- + drivers/md/bcache/btree.c | 10 +- + drivers/md/bcache/io.c | 16 +++- + drivers/md/bcache/journal.c | 4 +- + drivers/md/bcache/request.c | 185 +++++++++++++++++++++++++++++++------ + drivers/md/bcache/super.c | 206 ++++++++++++++++++++++++++++++++++++++---- + drivers/md/bcache/sysfs.c | 59 +++++++++++- + drivers/md/bcache/util.h | 6 -- + drivers/md/bcache/writeback.c | 94 ++++++++++++++++--- + drivers/md/bcache/writeback.h | 5 +- + 11 files changed, 551 insertions(+), 81 deletions(-) + +-- +2.16.1 + diff --git a/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch new file mode 100644 index 0000000..d218724 --- /dev/null +++ b/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch @@ -0,0 +1,79 @@ +From 31fa907c6f962fc229c4364ac08b239f7bf5384d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 13 Jan 2018 15:11:03 +0800 +Subject: [PATCH v5 01/11] bcache: set writeback_rate_update_seconds in range + [1, 60] seconds + +dc->writeback_rate_update_seconds can be set via sysfs and its value can +be set to [1, ULONG_MAX]. 
It does not make sense to set such a large +value, 60 seconds is long enough value considering the default 5 seconds +works well for long time. + +Because dc->writeback_rate_update is a special delayed work, it re-arms +itself inside the delayed work routine update_writeback_rate(). When +stopping it by cancel_delayed_work_sync(), there should be a timeout to +wait and make sure the re-armed delayed work is stopped too. A small max +value of dc->writeback_rate_update_seconds is also helpful to decide a +reasonable small timeout. + +This patch limits sysfs interface to set dc->writeback_rate_update_seconds +in range of [1, 60] seconds, and replaces the hand-coded number by macros. + +Changelog: +v2: fix a rebase typo in v4, which is pointed out by Michael Lyle. +v1: initial version. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +--- + drivers/md/bcache/sysfs.c | 4 +++- + drivers/md/bcache/writeback.c | 2 +- + drivers/md/bcache/writeback.h | 3 +++ + 3 files changed, 7 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index c524305cc9a7..4a6a697e1680 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -218,7 +218,9 @@ STORE(__cached_dev) + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + +- d_strtoul_nonzero(writeback_rate_update_seconds); ++ sysfs_strtoul_clamp(writeback_rate_update_seconds, ++ dc->writeback_rate_update_seconds, ++ 1, WRITEBACK_RATE_UPDATE_SECS_MAX); + d_strtoul(writeback_rate_i_term_inverse); + d_strtoul_nonzero(writeback_rate_p_term_inverse); + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 58218f7e77c3..f1d2fc15abcc 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -655,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate.rate = 1024; + 
dc->writeback_rate_minimum = 8; + +- dc->writeback_rate_update_seconds = 5; ++ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; + +diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h +index 66f1c527fa24..587b25599856 100644 +--- a/drivers/md/bcache/writeback.h ++++ b/drivers/md/bcache/writeback.h +@@ -8,6 +8,9 @@ + #define MAX_WRITEBACKS_IN_PASS 5 + #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ + ++#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 ++#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 ++ + /* + * 14 (16384ths) is chosen here as something that each backing device + * should be a reasonable fraction of the share, and not to blow up +-- +2.16.1 + diff --git a/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch new file mode 100644 index 0000000..1c09d03 --- /dev/null +++ b/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch @@ -0,0 +1,178 @@ +From c88508fc06f1d3d2241104d5e3b7b5e0045d24c0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 8 Jan 2018 23:05:58 +0800 +Subject: [PATCH v5 02/11] bcache: fix cached_dev->count usage for + bch_cache_set_error() + +When bcache metadata I/O fails, bcache will call bch_cache_set_error() +to retire the whole cache set. The expected behavior to retire a cache +set is to unregister the cache set, and unregister all backing device +attached to this cache set, then remove sysfs entries of the cache set +and all attached backing devices, finally release memory of structs +cache_set, cache, cached_dev and bcache_device. + +In my testing when journal I/O failure triggered by disconnected cache +device, sometimes the cache set cannot be retired, and its sysfs +entry /sys/fs/bcache/<uuid> still exits and the backing device also +references it. This is not expected behavior. 
+ +When metadata I/O fails, the call sequence to retire whole cache set is, + bch_cache_set_error() + bch_cache_set_unregister() + bch_cache_set_stop() + __cache_set_unregister() <- called as callback by calling + closure_queue(&c->caching) + cache_set_flush() <- called as a callback when refcount + of cache_set->caching is 0 + cache_set_free() <- called as a callback when refcount + of cache_set->cl is 0 + bch_cache_set_release() <- called as a callback when refcount + of cache_set->kobj is 0 + +I find if kernel thread bch_writeback_thread() quits the while-loop when +kthread_should_stop() is true and searched_full_index is false, closure +callback cache_set_flush() set by continue_at() will never be called. The +result is, bcache fails to retire whole cache set. + +cache_set_flush() will be called when refcount of closure c->caching is 0, +and in function bcache_device_detach() refcount of closure c->caching is +released to 0 by closure_put(). In metadata error code path, function +bcache_device_detach() is called by cached_dev_detach_finish(). This is a +callback routine being called when cached_dev->count is 0. This refcount +is decreased by cached_dev_put(). + +The above dependence indicates, cache_set_flush() will be called when +refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 +when refcount of cached_dev->count is 0. + +The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails +and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount +of cached_dev is not decreased properly. + +In bch_writeback_thread(), cached_dev_put() is called only when +searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a +there is no dirty data on cache. In most of run time it is correct, but +when bch_writeback_thread() quits the while-loop while cache is still +dirty, current code forgets to call cached_dev_put() before this kernel +thread exits.
This is why sometimes cache_set_flush() is not executed and +cache set fails to be retired. + +The reason to call cached_dev_put() in bch_writeback_rate() is, when the +cache device changes from clean to dirty, cached_dev_get() is called, to +make sure during writeback operations both backing and cache devices +won't be released. + +Adding following code in bch_writeback_thread() does not work, + static int bch_writeback_thread(void *arg) + } + ++ if (atomic_read(&dc->has_dirty)) ++ cached_dev_put() ++ + return 0; + } +because writeback kernel thread can be woken up and started via sysfs entry: + echo 1 > /sys/block/bcache<N>/bcache/writeback_running +It is difficult to check whether backing device is dirty without race and +extra lock. So the above modification will introduce potential refcount +underflow in some conditions. + +The correct fix is, to take cached dev refcount when creating the kernel +thread, and put it before the kernel thread exits. Then bcache does not +need to take a cached dev refcount when cache turns from clean to dirty, +or to put a cached dev refcount when cache turns from dirty to clean. The +writeback kernel thread is always safe to reference data structure from +cache set, cache and cached device (because a refcount of cache device is +taken for it already), and no matter the kernel thread is stopped by I/O +errors or system reboot, cached_dev->count can always be used correctly. + +The patch is simple, but understanding how it works is quite complicated. + +Changelog: +v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. +v1: initial version for review.
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/writeback.c | 11 ++++++++--- + drivers/md/bcache/writeback.h | 2 -- + 3 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a2ad37a8afc0..7d96dc6860fa 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + bch_sectors_dirty_init(&dc->disk); + atomic_set(&dc->has_dirty, 1); +- refcount_inc(&dc->count); + bch_writeback_queue(dc); + } + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index f1d2fc15abcc..b280c134dd4d 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) + + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); +- return 0; ++ break; + } + + schedule(); +@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); +- cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + } +@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg) + } + } + ++ dc->writeback_thread = NULL; ++ cached_dev_put(dc); ++ + return 0; + } + +@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + if (!dc->writeback_write_wq) + return -ENOMEM; + ++ cached_dev_get(dc); + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); +- if (IS_ERR(dc->writeback_thread)) ++ if (IS_ERR(dc->writeback_thread)) { ++ cached_dev_put(dc); + return PTR_ERR(dc->writeback_thread); ++ } + + 
schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h +index 587b25599856..0bba8f1c6cdf 100644 +--- a/drivers/md/bcache/writeback.h ++++ b/drivers/md/bcache/writeback.h +@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) + { + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { +- refcount_inc(&dc->count); +- + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ +-- +2.16.1 + diff --git a/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch new file mode 100644 index 0000000..d9aa14d --- /dev/null +++ b/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch @@ -0,0 +1,130 @@ +From ad362b22f19cca3073dfe9290529ea5ff63c8b4b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 14 Jan 2018 21:41:57 +0800 +Subject: [PATCH v5 03/11] bcache: quit dc->writeback_thread when + BCACHE_DEV_DETACHING is set + +In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()", +cached_dev_get() is called when creating dc->writeback_thread, and +cached_dev_put() is called when exiting dc->writeback_thread. This +modification works well unless people detach the bcache device manually by + 'echo 1 > /sys/block/bcache<N>/bcache/detach' +Because this sysfs interface only calls bch_cached_dev_detach() which wakes +up dc->writeback_thread but does not stop it. The reason is, before patch +"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside +bch_writeback_thread(), if cache is not dirty after writeback, +cached_dev_put() will be called here. And in cached_dev_make_request() when +a new write request makes cache from clean to dirty, cached_dev_get() will +be called there. 
Since we don't operate dc->count in these locations, +refcount d->count cannot be dropped after cache becomes clean, and +cached_dev_detach_finish() won't be called to detach bcache device. + +This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is +set inside bch_writeback_thread(). If this bit is set and cache is clean +(no existing writeback_keys), break the while-loop, call cached_dev_put() +and quit the writeback thread. + +Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the +writeback thread should continue to perform writeback, this is the original +design of manually detach. + +It is safe to do the following check without locking, let me explain why, ++ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && ++ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + +If the kenrel thread does not sleep and continue to run due to conditions +are not updated in time on the running CPU core, it just consumes more CPU +cycles and has no hurt. This should-sleep-but-run is safe here. We just +focus on the should-run-but-sleep condition, which means the writeback +thread goes to sleep in mistake while it should continue to run. +1, First of all, no matter the writeback thread is hung or not, kthread_stop() from + cached_dev_detach_finish() will wake up it and terminate by making + kthread_should_stop() return true. And in normal run time, bit on index + BCACHE_DEV_DETACHING is always cleared, the condition + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) + is always true and can be ignored as constant value. +2, If one of the following conditions is true, the writeback thread should + go to sleep, + "!atomic_read(&dc->has_dirty)" or "!dc->writeback_running)" + each of them independently controls the writeback thread should sleep or + not, let's analyse them one by one. 
+2.1 condition "!atomic_read(&dc->has_dirty)" + If dc->has_dirty is set from 0 to 1 on another CPU core, bcache will + call bch_writeback_queue() immediately or call bch_writeback_add() which + indirectly calls bch_writeback_queue() too. In bch_writeback_queue(), + wake_up_process(dc->writeback_thread) is called. It sets writeback + thread's task state to TASK_RUNNING and following an implicit memory + barrier, then tries to wake up the writeback thread. + In writeback thread, its task state is set to TASK_INTERRUPTIBLE before + doing the condition check. If other CPU core sets the TASK_RUNNING state + after writeback thread setting TASK_INTERRUPTIBLE, the writeback thread + will be scheduled to run very soon because its state is not + TASK_INTERRUPTIBLE. If other CPU core sets the TASK_RUNNING state before + writeback thread setting TASK_INTERRUPTIBLE, the implict memory barrier + of wake_up_process() will make sure modification of dc->has_dirty on + other CPU core is updated and observed on the CPU core of writeback + thread. Therefore the condition check will correctly be false, and + continue writeback code without sleeping. +2.2 condition "!dc->writeback_running)" + dc->writeback_running can be changed via sysfs file, every time it is + modified, a following bch_writeback_queue() is alwasy called. So the + change is always observed on the CPU core of writeback thread. If + dc->writeback_running is changed from 0 to 1 on other CPU core, this + condition check will observe the modification and allow writeback + thread to continue to run without sleeping. +Now we can see, even without a locking protection, multiple conditions +check is safe here, no deadlock or process hang up will happen. + +I compose a separte patch because that patch "bcache: fix cached_dev->count +usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes +Reinecke. Also this fix is not trivial and good for a separate patch. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.com> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/writeback.c | 20 +++++++++++++++++--- + 1 file changed, 17 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index b280c134dd4d..4dbeaaa575bf 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg) + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); +- if (!atomic_read(&dc->has_dirty) || +- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && +- !dc->writeback_running)) { ++ /* ++ * If the bache device is detaching, skip here and continue ++ * to perform writeback. Otherwise, if no dirty data on cache, ++ * or there is dirty data on cache but writeback is disabled, ++ * the writeback thread should sleep here and wait for others ++ * to wake up it. ++ */ ++ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && ++ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + up_write(&dc->writeback_lock); + + if (kthread_should_stop()) { +@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg) + atomic_set(&dc->has_dirty, 0); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); ++ /* ++ * If bcache device is detaching via sysfs interface, ++ * writeback thread should stop after there is no dirty ++ * data on cache. BCACHE_DEV_DETACHING flag is set in ++ * bch_cached_dev_detach(). 
++ */ ++ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) ++ break; + } + + up_write(&dc->writeback_lock); +-- +2.16.1 + diff --git a/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch new file mode 100644 index 0000000..83524b8 --- /dev/null +++ b/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch @@ -0,0 +1,268 @@ +From f4ce0fb52b341b89a5302daa4a0c2ce716281867 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 13 Jan 2018 15:48:39 +0800 +Subject: [PATCH v5 04/11] bcache: stop dc->writeback_rate_update properly + +struct delayed_work writeback_rate_update in struct cache_dev is a delayed +worker to call function update_writeback_rate() in period (the interval is +defined by dc->writeback_rate_update_seconds). + +When a metadate I/O error happens on cache device, bcache error handling +routine bch_cache_set_error() will call bch_cache_set_unregister() to +retire whole cache set. On the unregister code path, this delayed work is +stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). + +dc->writeback_rate_update is a special delayed work from others in bcache. +In its routine update_writeback_rate(), this delayed work is re-armed +itself. That means when cancel_delayed_work_sync() returns, this delayed +work can still be executed after several seconds defined by +dc->writeback_rate_update_seconds. + +The problem is, after cancel_delayed_work_sync() returns, the cache set +unregister code path will continue and release memory of struct cache set. +Then the delayed work is scheduled to run, __update_writeback_rate() +will reference the already released cache_set memory, and trigger a NULL +pointer deference fault. + +This patch introduces two more bcache device flags, +- BCACHE_DEV_WB_RUNNING + bit set: bcache device is in writeback mode and running, it is OK for + dc->writeback_rate_update to re-arm itself. 
+ bit clear:bcache device is trying to stop dc->writeback_rate_update, + this delayed work should not re-arm itself and quit. +- BCACHE_DEV_RATE_DW_RUNNING + bit set: routine update_writeback_rate() is executing. + bit clear: routine update_writeback_rate() quits. + +This patch also adds a function cancel_writeback_rate_update_dwork() to +wait for dc->writeback_rate_update quits before cancel it by calling +cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected +quit dc->writeback_rate_update, after time_out seconds this function will +give up and continue to call cancel_delayed_work_sync(). + +And here I explain how this patch stops self re-armed delayed work properly +with the above stuffs. + +update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning +and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling +cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. + +Before calling cancel_delayed_work_sync() wait utill flag +BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling +cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- +armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases +delayed work routine update_writeback_rate() won't be executed after +cancel_delayed_work_sync() returns. + +Inside update_writeback_rate() before calling schedule_delayed_work(), flag +BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means +someone is about to stop the delayed work. Because flag +BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() +has to wait for this flag to be cleared, we don't need to worry about race +condition here. + +If update_writeback_rate() is scheduled to run after checking +BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() +in cancel_writeback_rate_update_dwork(), it is also safe. Because at this +moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. 
As I mentioned +previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear +and quit immediately. + +Because there are more dependences inside update_writeback_rate() to struct +cache_set memory, dc->writeback_rate_update is not a simple self re-arm +delayed work. After trying many different methods (e.g. hold dc->count, or +use locks), this is the only way I can find which works to properly stop +dc->writeback_rate_update delayed work. + +Changelog: +v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING + to bit index, for test_bit(). +v2: Try to fix the race issue which is pointed out by Junhui. +v1: The initial version for review + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.com> +--- + drivers/md/bcache/bcache.h | 9 +++++---- + drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- + drivers/md/bcache/sysfs.c | 3 ++- + drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- + 4 files changed, 70 insertions(+), 10 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index b8c2e1bef1f1..0380626bf525 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -258,10 +258,11 @@ struct bcache_device { + struct gendisk *disk; + + unsigned long flags; +-#define BCACHE_DEV_CLOSING 0 +-#define BCACHE_DEV_DETACHING 1 +-#define BCACHE_DEV_UNLINK_DONE 2 +- ++#define BCACHE_DEV_CLOSING 0 ++#define BCACHE_DEV_DETACHING 1 ++#define BCACHE_DEV_UNLINK_DONE 2 ++#define BCACHE_DEV_WB_RUNNING 3 ++#define BCACHE_DEV_RATE_DW_RUNNING 4 + unsigned nr_stripes; + unsigned stripe_size; + atomic_t *stripe_sectors_dirty; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 7d96dc6860fa..e15cacecf078 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) + 
pr_debug("error creating sysfs link"); + } + ++/* ++ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed ++ * work dc->writeback_rate_update is running. Wait until the routine ++ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to ++ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out ++ * seconds, give up waiting here and continue to cancel it too. ++ */ ++static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) ++{ ++ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; ++ ++ do { ++ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, ++ &dc->disk.flags)) ++ break; ++ time_out--; ++ schedule_timeout_interruptible(1); ++ } while (time_out > 0); ++ ++ if (time_out == 0) ++ pr_warn("give up waiting for dc->writeback_write_update" ++ " to quit"); ++ ++ cancel_delayed_work_sync(&dc->writeback_rate_update); ++} ++ + static void cached_dev_detach_finish(struct work_struct *w) + { + struct cached_dev *dc = container_of(w, struct cached_dev, detach); +@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) + + mutex_lock(&bch_register_lock); + +- cancel_delayed_work_sync(&dc->writeback_rate_update); ++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cancel_writeback_rate_update_dwork(dc); ++ + if (!IS_ERR_OR_NULL(dc->writeback_thread)) { + kthread_stop(dc->writeback_thread); + dc->writeback_thread = NULL; +@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); ++ + cached_dev_put(dc); + } + +@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl) + { + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + +- cancel_delayed_work_sync(&dc->writeback_rate_update); ++ mutex_lock(&bch_register_lock); ++ ++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cancel_writeback_rate_update_dwork(dc); ++ + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + 
kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); + +- mutex_lock(&bch_register_lock); +- + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + bcache_device_free(&dc->disk); +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4a6a697e1680..399e91cbf714 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -306,7 +306,8 @@ STORE(bch_cached_dev) + bch_writeback_queue(dc); + + if (attr == &sysfs_writeback_percent) +- schedule_delayed_work(&dc->writeback_rate_update, ++ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 4dbeaaa575bf..8f98ef1038d3 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev, + writeback_rate_update); + ++ /* ++ * should check BCACHE_DEV_RATE_DW_RUNNING before calling ++ * cancel_delayed_work_sync(). 
++ */ ++ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); ++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ ++ smp_mb(); ++ ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); ++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ ++ smp_mb(); ++ return; ++ } ++ + down_read(&dc->writeback_lock); + + if (atomic_read(&dc->has_dirty) && +@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) + + up_read(&dc->writeback_lock); + +- schedule_delayed_work(&dc->writeback_rate_update, ++ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); ++ } ++ ++ /* ++ * should check BCACHE_DEV_RATE_DW_RUNNING before calling ++ * cancel_delayed_work_sync(). ++ */ ++ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); ++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ ++ smp_mb(); + } + + static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; + ++ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + } + +@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + return PTR_ERR(dc->writeback_thread); + } + ++ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + +-- +2.16.1 + diff --git a/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch new file mode 100644 index 0000000..57afe27 --- /dev/null +++ 
b/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch @@ -0,0 +1,491 @@ +From 805fefa99ced3d184d69c65449eb3e24104346b9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 14 Jan 2018 22:15:00 +0800 +Subject: [PATCH v5 05/11] bcache: add CACHE_SET_IO_DISABLE to struct cache_set + flags + +When too many I/Os failed on cache device, bch_cache_set_error() is called +in the error handling code path to retire whole problematic cache set. If +new I/O requests continue to come and take refcount dc->count, the cache +set won't be retired immediately, this is a problem. + +Further more, there are several kernel thread and self-armed kernel work +may still running after bch_cache_set_error() is called. It needs to wait +quite a while for them to stop, or they won't stop at all. They also +prevent the cache set from being retired. + +The solution in this patch is, to add per cache set flag to disable I/O +request on this cache and all attached backing devices. Then new coming I/O +requests can be rejected in *_make_request() before taking refcount, kernel +threads and self-armed kernel worker can stop very fast when flags bit +CACHE_SET_IO_DISABLE is set. + +Because bcache also do internal I/Os for writeback, garbage collection, +bucket allocation, journaling, this kind of I/O should be disabled after +bch_cache_set_error() is called. So closure_bio_submit() is modified to +check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, +closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and +return, generic_make_request() won't be called. + +A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit +from cache_set->flags, to disable or enable cache set I/O for debugging. It +is helpful to trigger more corner case issues for failed cache device. + +Changelog +v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index. + remove "bcache: " prefix when printing out kernel message. 
+v2, more changes by previous review, +- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. +- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this + is reported and inspired from origal patch of Pavel Vazharov. +v1, initial version. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Pavel Vazharov <freakpv@gmail.com> +--- + drivers/md/bcache/alloc.c | 3 ++- + drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ + drivers/md/bcache/btree.c | 10 +++++++--- + drivers/md/bcache/io.c | 2 +- + drivers/md/bcache/journal.c | 4 ++-- + drivers/md/bcache/request.c | 26 +++++++++++++++++++------- + drivers/md/bcache/super.c | 6 +++++- + drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ + drivers/md/bcache/util.h | 6 ------ + drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- + 10 files changed, 101 insertions(+), 29 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index 458e1d38577d..004cc3cc6123 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -287,7 +287,8 @@ do { \ + break; \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ +- if (kthread_should_stop()) { \ ++ if (kthread_should_stop() || \ ++ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ + set_current_state(TASK_RUNNING); \ + return 0; \ + } \ +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 0380626bf525..7917b3820dd5 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -475,10 +475,15 @@ struct gc_stat { + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. ++ * ++ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all ++ * external and internal I/O should be denied when this flag is set. 
++ * + */ + #define CACHE_SET_UNREGISTERING 0 + #define CACHE_SET_STOPPING 1 + #define CACHE_SET_RUNNING 2 ++#define CACHE_SET_IO_DISABLE 3 + + struct cache_set { + struct closure cl; +@@ -868,6 +873,19 @@ static inline void wake_up_allocators(struct cache_set *c) + wake_up_process(ca->alloc_thread); + } + ++static inline void closure_bio_submit(struct cache_set *c, ++ struct bio *bio, ++ struct closure *cl) ++{ ++ closure_get(cl); ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { ++ bio->bi_status = BLK_STS_IOERR; ++ bio_endio(bio); ++ return; ++ } ++ generic_make_request(bio); ++} ++ + /* Forward declarations */ + + void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index fad9fe8817eb..8ca50f387a1d 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) + + btree_gc_start(c); + ++ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); +@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) + + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); +- } while (ret); ++ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + bch_btree_gc_finish(c); + wake_up_allocators(c); +@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) + + while (1) { + wait_event_interruptible(c->gc_wait, +- kthread_should_stop() || gc_should_run(c)); ++ kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || ++ gc_should_run(c)); + +- if (kthread_should_stop()) ++ if (kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + break; + + set_gc_sectors(c); +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index a783c5a41ff1..8013ecbcdbda 100644 +--- a/drivers/md/bcache/io.c ++++ b/drivers/md/bcache/io.c +@@ -38,7 +38,7 @@ void 
__bch_submit_bbio(struct bio *bio, struct cache_set *c) + bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); + + b->submit_time_us = local_clock_us(); +- closure_bio_submit(bio, bio->bi_private); ++ closure_bio_submit(c, bio, bio->bi_private); + } + + void bch_submit_bbio(struct bio *bio, struct cache_set *c, +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 1b736b860739..c94085f400a4 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch_bio_map(bio, data); + +- closure_bio_submit(bio, &cl); ++ closure_bio_submit(ca->set, bio, &cl); + closure_sync(&cl); + + /* This function could be simpler now since we no longer write +@@ -674,7 +674,7 @@ static void journal_write_unlocked(struct closure *cl) + spin_unlock(&c->journal.lock); + + while ((bio = bio_list_pop(&list))) +- closure_bio_submit(bio, cl); ++ closure_bio_submit(c, bio, cl); + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 1a46b41dac70..02296bda6384 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) + + /* XXX: invalidate cache */ + +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + } + + continue_at(cl, cached_dev_cache_miss_done, NULL); +@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); +- closure_bio_submit(cache_bio, &s->cl); ++ closure_bio_submit(s->iop.c, cache_bio, &s->cl); + + return ret; + out_put: +@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; +- closure_bio_submit(miss, &s->cl); ++ closure_bio_submit(s->iop.c, 
miss, &s->cl); + return ret; + } + +@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) + + if ((bio_op(bio) != REQ_OP_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + } else if (s->iop.writeback) { + bch_writeback_add(dc); + s->iop.bio = bio; +@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) + flush->bi_private = cl; + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + +- closure_bio_submit(flush, cl); ++ closure_bio_submit(s->iop.c, flush, cl); + } + } else { + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); + +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + } + + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); +@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) + bch_journal_meta(s->iop.c, cl); + + /* If it's a flush, we send the flush to the backing device too */ +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + + continue_at(cl, cached_dev_bio_complete, NULL); + } +@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int rw = bio_data_dir(bio); + ++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ bio->bi_status = BLK_STS_IOERR; ++ bio_endio(bio); ++ return BLK_QC_T_NONE; ++ } ++ + atomic_set(&dc->backing_idle, 0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + +@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, + struct bcache_device *d = bio->bi_disk->private_data; + int rw = bio_data_dir(bio); + ++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ bio->bi_status = BLK_STS_IOERR; ++ bio_endio(bio); ++ return BLK_QC_T_NONE; ++ } ++ + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + + s 
= search_alloc(bio, d); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index e15cacecf078..f8b0d1196c12 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, + bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); + bch_bio_map(bio, ca->disk_buckets); + +- closure_bio_submit(bio, &ca->prio); ++ closure_bio_submit(ca->set, bio, &ca->prio); + closure_sync(cl); + } + +@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) + test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; + ++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ pr_warn("CACHE_SET_IO_DISABLE already set"); ++ + /* XXX: we can be called from atomic context + acquire_console_sem(); + */ +@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; ++ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + return c; + err: +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 399e91cbf714..cf973c07c856 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -95,6 +95,7 @@ read_attribute(partial_stripes_expensive); + + rw_attribute(synchronous); + rw_attribute(journal_delay_ms); ++rw_attribute(io_disable); + rw_attribute(discard); + rw_attribute(running); + rw_attribute(label); +@@ -588,6 +589,8 @@ SHOW(__bch_cache_set) + sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ sysfs_printf(io_disable, "%i", ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + if (attr == &sysfs_bset_tree_stats) + return bch_bset_print_stats(c, buf); +@@ -677,6 +680,22 @@ STORE(__bch_cache_set) + if (attr == 
&sysfs_io_error_halflife) + c->error_decay = strtoul_or_return(buf) / 88; + ++ if (attr == &sysfs_io_disable) { ++ int v = strtoul_or_return(buf); ++ ++ if (v) { ++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, ++ &c->flags)) ++ pr_warn("CACHE_SET_IO_DISABLE" ++ " already set"); ++ } else { ++ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, ++ &c->flags)) ++ pr_warn("CACHE_SET_IO_DISABLE" ++ " already cleared"); ++ } ++ } ++ + sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul(verify, c->verify); + sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); +@@ -762,6 +781,7 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, ++ &sysfs_io_disable, + NULL + }; + KTYPE(bch_cache_set_internal); +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index a6763db7f061..268024529edd 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) + return bdev->bd_inode->i_size >> 9; + } + +-#define closure_bio_submit(bio, cl) \ +-do { \ +- closure_get(cl); \ +- generic_make_request(bio); \ +-} while (0) +- + uint64_t bch_crc64_update(uint64_t, const void *, size_t); + uint64_t bch_crc64(const void *, size_t); + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 8f98ef1038d3..3d7d8452e0de 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); ++ struct cache_set *c = dc->disk.c; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling +@@ -123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); + +- if 
(!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ /* ++ * CACHE_SET_IO_DISABLE might be set via sysfs interface, ++ * check it here too. ++ */ ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); +@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) + + up_read(&dc->writeback_lock); + +- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ /* ++ * CACHE_SET_IO_DISABLE might be set via sysfs interface, ++ * check it here too. ++ */ ++ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } +@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; + +- closure_bio_submit(&io->bio, cl); ++ closure_bio_submit(io->dc->disk.c, &io->bio, cl); + } + + atomic_set(&dc->writeback_sequence_next, next_sequence); +@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) + { + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + +- closure_bio_submit(&io->bio, cl); ++ closure_bio_submit(io->dc->disk.c, &io->bio, cl); + + continue_at(cl, write_dirty, io->dc->writeback_write_wq); + } +@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) + + next = bch_keybuf_next(&dc->writeback_keys); + +- while (!kthread_should_stop() && next) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && ++ next) { + size = 0; + nk = 0; + +@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) + } + } + +- while (!kthread_should_stop() && delay) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && ++ delay) { + 
schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } +@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) + static int bch_writeback_thread(void *arg) + { + struct cached_dev *dc = arg; ++ struct cache_set *c = dc->disk.c; + bool searched_full_index; + + bch_ratelimit_reset(&dc->writeback_rate); + +- while (!kthread_should_stop()) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + /* +@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg) + (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + up_write(&dc->writeback_lock); + +- if (kthread_should_stop()) { ++ if (kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + set_current_state(TASK_RUNNING); + break; + } +@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg) + + while (delay && + !kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_interruptible(delay); + +-- +2.16.1 + diff --git a/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch new file mode 100644 index 0000000..7cea41c --- /dev/null +++ b/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch @@ -0,0 +1,67 @@ +From e741114d13958a91559cd8c376cf704cb2180370 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 00:26:32 +0800 +Subject: [PATCH v5 06/11] bcache: stop all attached bcache devices for a + retired cache set + +When there are too many I/O errors on cache device, current bcache code +will retire the whole cache set, and detach all bcache devices. But the +detached bcache devices are not stopped, which is problematic when bcache +is in writeback mode. 
+
+If the retired cache set has dirty data of backing devices, continue
+writing to bcache device will write to backing device directly. If the
+LBA of write request has a dirty version cached on cache device, next time
+when the cache device is re-registered and backing device re-attached to
+it again, the stale dirty data on cache device will be written to backing
+device, and overwrite latest directly written data. This situation causes
+quite severe data corruption.
+
+This patch checks whether cache_set->io_disable is true in
+__cache_set_unregister(). If cache_set->io_disable is true, it means cache
+set is unregistering by too many I/O errors, then all attached bcache
+devices will be stopped as well. If cache_set->io_disable is not true, it
+means __cache_set_unregister() is triggered by writing 1 to sysfs file
+/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do
+it explicitly, this patch keeps existing behavior and does not stop any
+bcache device.
+
+Even if the failed cache device has no dirty data, stopping bcache device is
+still a desired behavior by many Ceph and database users. Then their
+application will report I/O errors due to disappeared bcache device, and
+operations people will know the cache device is broken or disconnected.
+
+Changelog:
+v2: add reviewed-by from Hannes.
+v1: initial version for review.
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +--- + drivers/md/bcache/super.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f8b0d1196c12..41ef438e7b40 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1478,6 +1478,14 @@ static void __cache_set_unregister(struct closure *cl) + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); ++ /* ++ * If we come here by too many I/O errors, ++ * bcache device should be stopped too, to ++ * keep data consistency on cache and ++ * backing devices. ++ */ ++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ bcache_device_stop(c->devices[i]); + } else { + bcache_device_stop(c->devices[i]); + } +-- +2.16.1 + diff --git a/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch new file mode 100644 index 0000000..419c2c7 --- /dev/null +++ b/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch @@ -0,0 +1,119 @@ +From d6f9f789096d2c3473314fa10ea1166683399ac8 Mon Sep 17 00:00:00 2001 +From: Tang Junhui <tang.junhui@zte.com.cn> +Date: Tue, 9 Jan 2018 10:27:11 +0800 +Subject: [PATCH v5 07/11] bcache: fix inaccurate io state for detached bcache + devices + +When we run IO in a detached device, and run iostat to shows IO status, +normally it will show like bellow (Omitted some fields): +Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util +sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 +bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 +but after IO stopped, there are still very big avgqu-sz and %util +values as bellow: +Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util +bcache0 ... 
0 5326.32 0.00 0.00 0.00 0.00 100.10 + +The reason for this issue is that, only generic_start_io_acct() called +and no generic_end_io_acct() called for detached device in +cached_dev_make_request(). See the code: +//start generic_start_io_acct() +generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); +if (cached_dev_get(dc)) { + //will callback generic_end_io_acct() +} +else { + //will not call generic_end_io_acct() +} + +This patch calls generic_end_io_acct() in the end of IO for detached +devices, so we can show IO state correctly. + +(Modified to use GFP_NOIO in kzalloc() by Coly Li) + +Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> +Reviewed-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +--- + drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 51 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 02296bda6384..e09c5ae745be 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) + continue_at(cl, cached_dev_bio_complete, NULL); + } + ++struct detached_dev_io_private { ++ struct bcache_device *d; ++ unsigned long start_time; ++ bio_end_io_t *bi_end_io; ++ void *bi_private; ++}; ++ ++static void detatched_dev_end_io(struct bio *bio) ++{ ++ struct detached_dev_io_private *ddip; ++ ++ ddip = bio->bi_private; ++ bio->bi_end_io = ddip->bi_end_io; ++ bio->bi_private = ddip->bi_private; ++ ++ generic_end_io_acct(ddip->d->disk->queue, ++ bio_data_dir(bio), ++ &ddip->d->disk->part0, ddip->start_time); ++ ++ kfree(ddip); ++ ++ bio->bi_end_io(bio); ++} ++ ++static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) ++{ ++ struct detached_dev_io_private *ddip; ++ struct cached_dev *dc = container_of(d, struct cached_dev, disk); ++ ++ /* ++ * no need to call closure_get(&dc->disk.cl), ++ * because upper layer had 
already opened bcache device, ++ * which would call closure_get(&dc->disk.cl) ++ */ ++ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ++ ddip->d = d; ++ ddip->start_time = jiffies; ++ ddip->bi_end_io = bio->bi_end_io; ++ ddip->bi_private = bio->bi_private; ++ bio->bi_end_io = detatched_dev_end_io; ++ bio->bi_private = ddip; ++ ++ if ((bio_op(bio) == REQ_OP_DISCARD) && ++ !blk_queue_discard(bdev_get_queue(dc->bdev))) ++ bio->bi_end_io(bio); ++ else ++ generic_make_request(bio); ++} ++ + /* Cached devices - read & write stuff */ + + static blk_qc_t cached_dev_make_request(struct request_queue *q, +@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + else + cached_dev_read(dc, s); + } +- } else { +- if ((bio_op(bio) == REQ_OP_DISCARD) && +- !blk_queue_discard(bdev_get_queue(dc->bdev))) +- bio_endio(bio); +- else +- generic_make_request(bio); +- } ++ } else ++ detached_dev_do_request(d, bio); + + return BLK_QC_T_NONE; + } +-- +2.16.1 + diff --git a/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch new file mode 100644 index 0000000..4cddd78 --- /dev/null +++ b/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch @@ -0,0 +1,255 @@ +From e76beb2960de33506c0f6e177d43f8a8cfafee30 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 21:01:48 +0800 +Subject: [PATCH v5 08/11] bcache: add backing_request_endio() for bi_end_io of + attached backing device I/O + +In order to catch I/O error of backing device, a separate bi_end_io +call back is required. Then a per backing device counter can record I/O +errors number and retire the backing device if the counter reaches a +per backing device I/O error limit. + +This patch adds backing_request_endio() to bcache backing device I/O code +path, this is a preparation for further complicated backing device failure +handling. 
So far there is no real code logic change, I make this change a +separate patch to make sure it is stable and reliable for further work. + +Changelog: +v2: Fix code comments typo, remove a redundant bch_writeback_add() line + added in v4 patch set. +v1: indeed this is new added in this patch set. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +--- + drivers/md/bcache/request.c | 93 +++++++++++++++++++++++++++++++++++-------- + drivers/md/bcache/super.c | 1 + + drivers/md/bcache/writeback.c | 1 + + 3 files changed, 79 insertions(+), 16 deletions(-) + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index e09c5ae745be..9c6dda3b0068 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) + } + + op->insert_data_done = true; ++ /* get in bch_data_insert() */ + bio_put(bio); + out: + continue_at(cl, bch_data_insert_keys, op->wq); +@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) + closure_put(cl); + } + ++static void backing_request_endio(struct bio *bio) ++{ ++ struct closure *cl = bio->bi_private; ++ ++ if (bio->bi_status) { ++ struct search *s = container_of(cl, struct search, cl); ++ /* ++ * If a bio has REQ_PREFLUSH for writeback mode, it is ++ * speically assembled in cached_dev_write() for a non-zero ++ * write request which has REQ_PREFLUSH. we don't set ++ * s->iop.status by this failure, the status will be decided ++ * by result of bch_data_insert() operation. 
++ */ ++ if (unlikely(s->iop.writeback && ++ bio->bi_opf & REQ_PREFLUSH)) { ++ char buf[BDEVNAME_SIZE]; ++ ++ bio_devname(bio, buf); ++ pr_err("Can't flush %s: returned bi_status %i", ++ buf, bio->bi_status); ++ } else { ++ /* set to orig_bio->bi_status in bio_complete() */ ++ s->iop.status = bio->bi_status; ++ } ++ s->recoverable = false; ++ /* should count I/O error for backing device here */ ++ } ++ ++ bio_put(bio); ++ closure_put(cl); ++} ++ + static void bio_complete(struct search *s) + { + if (s->orig_bio) { +@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) + } + } + +-static void do_bio_hook(struct search *s, struct bio *orig_bio) ++static void do_bio_hook(struct search *s, ++ struct bio *orig_bio, ++ bio_end_io_t *end_io_fn) + { + struct bio *bio = &s->bio.bio; + + bio_init(bio, NULL, 0); + __bio_clone_fast(bio, orig_bio); +- bio->bi_end_io = request_endio; ++ /* ++ * bi_end_io can be set separately somewhere else, e.g. the ++ * variants in, ++ * - cache_bio->bi_end_io from cached_dev_cache_miss() ++ * - n->bi_end_io from cache_lookup_fn() ++ */ ++ bio->bi_end_io = end_io_fn; + bio->bi_private = &s->cl; + + bio_cnt_set(bio, 3); +@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, + s = mempool_alloc(d->c->search, GFP_NOIO); + + closure_init(&s->cl, NULL); +- do_bio_hook(s, bio); ++ do_bio_hook(s, bio, request_endio); + + s->orig_bio = bio; + s->cache_miss = NULL; +@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) + trace_bcache_read_retry(s->orig_bio); + + s->iop.status = 0; +- do_bio_hook(s, s->orig_bio); ++ do_bio_hook(s, s->orig_bio, backing_request_endio); + + /* XXX: invalidate cache */ + ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, bio, cl); + } + +@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + bio_copy_dev(cache_bio, miss); + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + +- cache_bio->bi_end_io = 
request_endio; ++ cache_bio->bi_end_io = backing_request_endio; + cache_bio->bi_private = &s->cl; + + bch_bio_map(cache_bio, NULL); +@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, cache_bio, &s->cl); + + return ret; + out_put: + bio_put(cache_bio); + out_submit: +- miss->bi_end_io = request_endio; ++ miss->bi_end_io = backing_request_endio; + miss->bi_private = &s->cl; ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); + return ret; + } +@@ -943,31 +987,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); + +- if ((bio_op(bio) != REQ_OP_DISCARD) || +- blk_queue_discard(bdev_get_queue(dc->bdev))) +- closure_bio_submit(s->iop.c, bio, cl); ++ if (bio_op(bio) == REQ_OP_DISCARD && ++ !blk_queue_discard(bdev_get_queue(dc->bdev))) ++ goto insert_data; ++ ++ /* I/O request sent to backing device */ ++ bio->bi_end_io = backing_request_endio; ++ closure_bio_submit(s->iop.c, bio, cl); ++ + } else if (s->iop.writeback) { + bch_writeback_add(dc); + s->iop.bio = bio; + + if (bio->bi_opf & REQ_PREFLUSH) { +- /* Also need to send a flush to the backing device */ +- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, +- dc->disk.bio_split); +- ++ /* ++ * Also need to send a flush to the backing ++ * device. 
++ */ ++ struct bio *flush; ++ ++ flush = bio_alloc_bioset(GFP_NOIO, 0, ++ dc->disk.bio_split); ++ if (!flush) { ++ s->iop.status = BLK_STS_RESOURCE; ++ goto insert_data; ++ } + bio_copy_dev(flush, bio); +- flush->bi_end_io = request_endio; ++ flush->bi_end_io = backing_request_endio; + flush->bi_private = cl; + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; +- ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, flush, cl); + } + } else { + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); +- ++ /* I/O request sent to backing device */ ++ bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + } + ++insert_data: + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); + } +@@ -981,6 +1040,7 @@ static void cached_dev_nodata(struct closure *cl) + bch_journal_meta(s->iop.c, cl); + + /* If it's a flush, we send the flush to the backing device too */ ++ bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + + continue_at(cl, cached_dev_bio_complete, NULL); +@@ -1078,6 +1138,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + cached_dev_read(dc, s); + } + } else ++ /* I/O request sent to backing device */ + detached_dev_do_request(d, bio); + + return BLK_QC_T_NONE; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 41ef438e7b40..082faaf2ee2f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) + bio->bi_private = dc; + + closure_get(cl); ++ /* I/O request sent to backing device */ + __write_super(&dc->sb, bio); + + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 3d7d8452e0de..4ebe0119ea7e 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -289,6 
+289,7 @@ static void write_dirty(struct closure *cl) + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; + ++ /* I/O request sent to backing device */ + closure_bio_submit(io->dc->disk.c, &io->bio, cl); + } + +-- +2.16.1 + diff --git a/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch new file mode 100644 index 0000000..9a63f8d --- /dev/null +++ b/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch @@ -0,0 +1,237 @@ +From fff4f5d3d40952ddba1649fd7a5ce0d025b0b3cc Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 21:33:45 +0800 +Subject: [PATCH v5 09/11] bcache: add io_disable to struct cached_dev + +If a bcache device is configured to writeback mode, current code does not +handle write I/O errors on backing devices properly. + +In writeback mode, write request is written to cache device, and +latter being flushed to backing device. If I/O failed when writing from +cache device to the backing device, bcache code just ignores the error and +upper layer code is NOT noticed that the backing device is broken. + +This patch tries to handle backing device failure like how the cache device +failure is handled, +- Add a error counter 'io_errors' and error limit 'error_limit' in struct + cached_dev. Add another io_disable to struct cached_dev to disable I/Os + on the problematic backing device. +- When I/O error happens on backing device, increase io_errors counter. And + if io_errors reaches error_limit, set cache_dev->io_disable to true, and + stop the bcache device. + +The result is, if backing device is broken of disconnected, and I/O errors +reach its error limit, backing device will be disabled and the associated +bcache device will be removed from system. + +Changelog: +v2: remove "bcache: " prefix in pr_error(), and use correct name string to + print out bcache device gendisk name. 
+v1: indeed this is new added in v2 patch set. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/bcache.h | 7 +++++++ + drivers/md/bcache/io.c | 14 ++++++++++++++ + drivers/md/bcache/request.c | 14 ++++++++++++-- + drivers/md/bcache/super.c | 22 ++++++++++++++++++++++ + drivers/md/bcache/sysfs.c | 15 ++++++++++++++- + 5 files changed, 69 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 7917b3820dd5..822ec75bb78c 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -360,6 +360,7 @@ struct cached_dev { + unsigned sequential_cutoff; + unsigned readahead; + ++ unsigned io_disable:1; + unsigned verify:1; + unsigned bypass_torture_test:1; + +@@ -379,6 +380,10 @@ struct cached_dev { + unsigned writeback_rate_i_term_inverse; + unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; ++ ++#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 ++ atomic_t io_errors; ++ unsigned error_limit; + }; + + enum alloc_reserve { +@@ -888,6 +893,7 @@ static inline void closure_bio_submit(struct cache_set *c, + + /* Forward declarations */ + ++void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); + void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); + void bch_bbio_count_io_errors(struct cache_set *, struct bio *, + blk_status_t, const char *); +@@ -915,6 +921,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, bool); + bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); ++bool bch_cached_dev_error(struct cached_dev *dc); + + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *, const char *, ...); +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index 8013ecbcdbda..7fac97ae036e 100644 +--- a/drivers/md/bcache/io.c ++++ 
b/drivers/md/bcache/io.c +@@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, + } + + /* IO errors */ ++void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) ++{ ++ char buf[BDEVNAME_SIZE]; ++ unsigned errors; ++ ++ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); ++ ++ errors = atomic_add_return(1, &dc->io_errors); ++ if (errors < dc->error_limit) ++ pr_err("%s: IO error on backing device, unrecoverable", ++ bio_devname(bio, buf)); ++ else ++ bch_cached_dev_error(dc); ++} + + void bch_count_io_errors(struct cache *ca, + blk_status_t error, +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 9c6dda3b0068..03245e6980a6 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) + + if (bio->bi_status) { + struct search *s = container_of(cl, struct search, cl); ++ struct cached_dev *dc = container_of(s->d, ++ struct cached_dev, disk); + /* + * If a bio has REQ_PREFLUSH for writeback mode, it is + * speically assembled in cached_dev_write() for a non-zero +@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) + } + s->recoverable = false; + /* should count I/O error for backing device here */ ++ bch_count_backing_io_errors(dc, bio); + } + + bio_put(bio); +@@ -1065,8 +1068,14 @@ static void detatched_dev_end_io(struct bio *bio) + bio_data_dir(bio), + &ddip->d->disk->part0, ddip->start_time); + +- kfree(ddip); ++ if (bio->bi_status) { ++ struct cached_dev *dc = container_of(ddip->d, ++ struct cached_dev, disk); ++ /* should count I/O error for backing device here */ ++ bch_count_backing_io_errors(dc, bio); ++ } + ++ kfree(ddip); + bio->bi_end_io(bio); + } + +@@ -1105,7 +1114,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int rw = bio_data_dir(bio); + +- if (unlikely(d->c && 
test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || ++ dc->io_disable)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 082faaf2ee2f..91a08cdd55bd 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) + max(dc->disk.disk->queue->backing_dev_info->ra_pages, + q->backing_dev_info->ra_pages); + ++ atomic_set(&dc->io_errors, 0); ++ dc->io_disable = false; ++ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; ++ + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); + return 0; +@@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) + return flash_dev_run(c, u); + } + ++bool bch_cached_dev_error(struct cached_dev *dc) ++{ ++ char name[BDEVNAME_SIZE]; ++ ++ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) ++ return false; ++ ++ dc->io_disable = true; ++ /* make others know io_disable is true earlier */ ++ smp_mb(); ++ ++ pr_err("stop %s: too many IO errors on backing device %s\n", ++ dc->disk.disk->disk_name, bdevname(dc->bdev, name)); ++ ++ bcache_device_stop(&dc->disk); ++ return true; ++} ++ + /* Cache set */ + + __printf(2, 3) +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index cf973c07c856..ac3adf2dcf6c 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -134,7 +134,9 @@ SHOW(__bch_cached_dev) + var_print(writeback_delay); + var_print(writeback_percent); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); +- ++ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); ++ sysfs_printf(io_error_limit, "%i", dc->error_limit); ++ sysfs_printf(io_disable, "%i", dc->io_disable); + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_i_term_inverse); + 
var_print(writeback_rate_p_term_inverse); +@@ -225,6 +227,14 @@ STORE(__cached_dev) + d_strtoul(writeback_rate_i_term_inverse); + d_strtoul_nonzero(writeback_rate_p_term_inverse); + ++ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); ++ ++ if (attr == &sysfs_io_disable) { ++ int v = strtoul_or_return(buf); ++ ++ dc->io_disable = v ? 1 : 0; ++ } ++ + d_strtoi_h(sequential_cutoff); + d_strtoi_h(readahead); + +@@ -332,6 +342,9 @@ static struct attribute *bch_cached_dev_files[] = { + &sysfs_writeback_rate_i_term_inverse, + &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_debug, ++ &sysfs_errors, ++ &sysfs_io_error_limit, ++ &sysfs_io_disable, + &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, +-- +2.16.1 + diff --git a/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch new file mode 100644 index 0000000..d03b24e --- /dev/null +++ b/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch @@ -0,0 +1,152 @@ +From 7d8d5a020e69671723932da30a1800eed91d3bcd Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 13 Jan 2018 17:31:44 +0800 +Subject: [PATCH v5 10/11] bcache: stop bcache device when backing device is + offline + +Currently bcache does not handle backing device failure, if backing +device is offline and disconnected from system, its bcache device can still +be accessible. If the bcache device is in writeback mode, I/O requests even +can success if the requests hit on cache device. That is to say, when and +how bcache handles offline backing device is undefined. + +This patch tries to handle backing device offline in a rather simple way, +- Add cached_dev->status_update_thread kernel thread to update backing + device status in every 1 second. +- Add cached_dev->offline_seconds to record how many seconds the backing + device is observed to be offline. 
If the backing device is offline for + BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and + call bcache_device_stop() to stop the bache device which linked to the + offline backing device. + +Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, +its bcache device will be removed, then user space application writing on +it will get error immediately, and handler the device failure in time. + +This patch is quite simple, does not handle more complicated situations. +Once the bcache device is stopped, users need to recovery the backing +device, register and attach it manually. + +Changelog: +v2: remove "bcache: " prefix when calling pr_warn(). +v1: initial version. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/bcache.h | 2 ++ + drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 57 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 822ec75bb78c..1c749352172d 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -338,6 +338,7 @@ struct cached_dev { + + struct keybuf writeback_keys; + ++ struct task_struct *status_update_thread; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. 
(Maintain LBA order; don't allow reads completing out of +@@ -384,6 +385,7 @@ struct cached_dev { + #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned error_limit; ++ unsigned offline_seconds; + }; + + enum alloc_reserve { +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 91a08cdd55bd..de0f5fb9bde2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) + { + struct bcache_device *d = b->bd_disk->private_data; ++ struct cached_dev *dc = container_of(d, struct cached_dev, disk); ++ ++ if (dc->io_disable) ++ return -EIO; ++ + return d->ioctl(d, mode, cmd, arg); + } + +@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) + c->cached_dev_sectors = sectors; + } + ++#define BACKING_DEV_OFFLINE_TIMEOUT 5 ++static int cached_dev_status_update(void *arg) ++{ ++ struct cached_dev *dc = arg; ++ struct request_queue *q; ++ char buf[BDEVNAME_SIZE]; ++ ++ /* ++ * If this delayed worker is stopping outside, directly quit here. ++ * dc->io_disable might be set via sysfs interface, so check it ++ * here too. 
++ */ ++ while (!kthread_should_stop() && !dc->io_disable) { ++ q = bdev_get_queue(dc->bdev); ++ if (blk_queue_dying(q)) ++ dc->offline_seconds++; ++ else ++ dc->offline_seconds = 0; ++ ++ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { ++ pr_err("%s: device offline for %d seconds", ++ bdevname(dc->bdev, buf), ++ BACKING_DEV_OFFLINE_TIMEOUT); ++ pr_err("%s: disable I/O request due to backing " ++ "device offline", dc->disk.name); ++ dc->io_disable = true; ++ /* let others know earlier that io_disable is true */ ++ smp_mb(); ++ bcache_device_stop(&dc->disk); ++ break; ++ } ++ ++ schedule_timeout_interruptible(HZ); ++ } ++ ++ dc->status_update_thread = NULL; ++ return 0; ++} ++ + void bch_cached_dev_run(struct cached_dev *dc) + { + struct bcache_device *d = &dc->disk; +@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc) + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + pr_debug("error creating sysfs link"); ++ ++ dc->status_update_thread = kthread_run(cached_dev_status_update, ++ dc, ++ "bcache_status_update"); ++ if (IS_ERR(dc->status_update_thread)) { ++ pr_warn("failed to create bcache_status_update kthread, " ++ "continue to run without monitoring backing " ++ "device status"); ++ } + } + + /* +@@ -1118,6 +1171,8 @@ static void cached_dev_free(struct closure *cl) + kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); ++ if (!IS_ERR_OR_NULL(dc->status_update_thread)) ++ kthread_stop(dc->status_update_thread); + + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); +-- +2.16.1 + diff --git a/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch b/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch new file mode 100644 index 0000000..c782476 --- /dev/null +++ 
b/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch @@ -0,0 +1,251 @@ +From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 5 Feb 2018 10:43:18 +0800 +Subject: [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to + backing device + +Current bcache failure handling code will stop all attached bcache devices +when the cache set is broken or disconnected. This might not be desired +behavior, example bcache deployed for an email service. In such workload, +if cache device is broken but no dirty data lost, keep the bcache device +alive and permit email service continue to access data might be a better +solution for the cache device failure. + +Nix <nix@esperi.org.uk> points out the issue and provides the above example +to explain why it might be necessary to not stop bcache device for broken +cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant +suggestion to provide "always" and "auto" options to per-cached device +sysfs file stop_when_cache_set_failed. If cache set is retiring and the +backing device has no dirty data on cache, it should be safe to keep the +bcache device alive. In this case, if stop_when_cache_set_failed is set to +"auto", the device failure handling code will not stop this bcache device +and permit application to access the backing device with a unattached +bcache device. + +Changelog: +v2: change option values of stop_when_cache_set_failed from 1/0 to + "auto"/"always". +v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1 + (always stop). 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Nix <nix@esperi.org.uk> +Cc: Pavel Goran <via-bcache@pvgoran.name> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Hannes Reinecke <hare@suse.com> +--- + drivers/md/bcache/bcache.h | 8 +++++ + drivers/md/bcache/super.c | 90 ++++++++++++++++++++++++++++++++++++---------- + drivers/md/bcache/sysfs.c | 17 +++++++++ + 3 files changed, 97 insertions(+), 18 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 1c749352172d..59e675304b7e 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -287,6 +287,12 @@ struct io { + sector_t last; + }; + ++enum stop_on_faliure { ++ BCH_CACHED_DEV_STOP_ATUO = 0, ++ BCH_CACHED_DEV_STOP_ALWAYS, ++ BCH_CACHED_DEV_STOP_MODE_MAX, ++}; ++ + struct cached_dev { + struct list_head list; + struct bcache_device disk; +@@ -382,6 +388,7 @@ struct cached_dev { + unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; + ++ enum stop_on_faliure stop_when_cache_set_failed; + #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned error_limit; +@@ -933,6 +940,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *); + + extern struct workqueue_struct *bcache_wq; + extern const char * const bch_cache_modes[]; ++extern const char * const bch_stop_on_failure_modes[]; + extern struct mutex bch_register_lock; + extern struct list_head bch_cache_sets; + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index de0f5fb9bde2..d2999f1e6ae2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = { + NULL + }; + ++/* Default is -1; we skip past it for stop_when_cache_set_failed */ ++const char * const bch_stop_on_failure_modes[] = { ++ "default", ++ "auto", ++ "always", ++ NULL ++}; ++ + static struct kobject *bcache_kobj; + struct mutex bch_register_lock; + 
LIST_HEAD(bch_cache_sets); +@@ -1246,6 +1254,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) + atomic_set(&dc->io_errors, 0); + dc->io_disable = false; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; ++ /* default to auto */ ++ dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_ATUO; + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); +@@ -1541,33 +1551,77 @@ static void cache_set_flush(struct closure *cl) + closure_return(cl); + } + ++/* ++ * This function is only called when CACHE_SET_IO_DISABLE is set, which means ++ * cache set is unregistering due to too many I/O errors. In this condition, ++ * the bcache device might be stopped, it depends on stop_when_cache_set_failed ++ * value and whether the broken cache has dirty data: ++ * ++ * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device ++ * BCH_CACHED_STOP_ATUO 0 NO ++ * BCH_CACHED_STOP_ATUO 1 YES ++ * BCH_CACHED_DEV_STOP_ALWAYS 0 YES ++ * BCH_CACHED_DEV_STOP_ALWAYS 1 YES ++ * ++ * The expected behavior is, if stop_when_cache_set_failed is configured to ++ * "auto" via sysfs interface, the bcache device will not be stopped if the ++ * backing device is clean on the broken cache device. 
++ */ ++static void conditional_stop_bcache_device(struct cache_set *c, ++ struct bcache_device *d, ++ struct cached_dev *dc) ++{ ++ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { ++ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop" ++ " it for failed cache set %pU.", ++ d->disk->disk_name, c->sb.set_uuid); ++ bcache_device_stop(d); ++ return; ++ } else if (atomic_read(&dc->has_dirty)) { ++ /* ++ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO ++ * and dc->has_dirty == 1 ++ */ ++ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " ++ "cache is dirty, stop it to avoid potential data " ++ "corruption.", ++ d->disk->disk_name); ++ bcache_device_stop(d); ++ } else { ++ /* ++ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO ++ * and dc->has_dirty == 0 ++ */ ++ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " ++ "cache is clean, keep it alive.", ++ d->disk->disk_name); ++ } ++} ++ + static void __cache_set_unregister(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cached_dev *dc; ++ struct bcache_device *d; + size_t i; + + mutex_lock(&bch_register_lock); + +- for (i = 0; i < c->devices_max_used; i++) +- if (c->devices[i]) { +- if (!UUID_FLASH_ONLY(&c->uuids[i]) && +- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { +- dc = container_of(c->devices[i], +- struct cached_dev, disk); +- bch_cached_dev_detach(dc); +- /* +- * If we come here by too many I/O errors, +- * bcache device should be stopped too, to +- * keep data consistency on cache and +- * backing devices. 
+- */ +- if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) +- bcache_device_stop(c->devices[i]); +- } else { +- bcache_device_stop(c->devices[i]); +- } ++ for (i = 0; i < c->devices_max_used; i++) { ++ d = c->devices[i]; ++ if (!d) ++ continue; ++ ++ if (!UUID_FLASH_ONLY(&c->uuids[i]) && ++ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { ++ dc = container_of(d, struct cached_dev, disk); ++ bch_cached_dev_detach(dc); ++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ conditional_stop_bcache_device(c, d, dc); ++ } else { ++ bcache_device_stop(d); + } ++ } + + mutex_unlock(&bch_register_lock); + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index ac3adf2dcf6c..e88fdcc549cd 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us); + rw_attribute(sequential_cutoff); + rw_attribute(data_csum); + rw_attribute(cache_mode); ++rw_attribute(stop_when_cache_set_failed); + rw_attribute(writeback_metadata); + rw_attribute(writeback_running); + rw_attribute(writeback_percent); +@@ -126,6 +127,12 @@ SHOW(__bch_cached_dev) + bch_cache_modes + 1, + BDEV_CACHE_MODE(&dc->sb)); + ++ if (attr == &sysfs_stop_when_cache_set_failed) ++ return bch_snprint_string_list(buf, PAGE_SIZE, ++ bch_stop_on_failure_modes + 1, ++ dc->stop_when_cache_set_failed); ++ ++ + sysfs_printf(data_csum, "%i", dc->disk.data_csum); + var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); +@@ -257,6 +264,15 @@ STORE(__cached_dev) + } + } + ++ if (attr == &sysfs_stop_when_cache_set_failed) { ++ v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1); ++ ++ if (v < 0) ++ return v; ++ ++ dc->stop_when_cache_set_failed = v; ++ } ++ + if (attr == &sysfs_label) { + if (size > SB_LABEL_SIZE) + return -EINVAL; +@@ -333,6 +349,7 @@ static struct attribute *bch_cached_dev_files[] = { + &sysfs_data_csum, + #endif + &sysfs_cache_mode, ++ &sysfs_stop_when_cache_set_failed, + &sysfs_writeback_metadata, + 
&sysfs_writeback_running, + &sysfs_writeback_delay, +-- +2.16.1 + |