diff options
author | Coly Li <colyli@suse.de> | 2018-02-05 23:15:25 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2018-02-05 23:15:25 +0800 |
commit | dcc1da5ab0d2ea18f2f5e5dd769047fcc9c0bb43 (patch) | |
tree | bc72e87b45912fac9995a624c0f5f12e7715fd6c | |
parent | 2bbb32018a3cbf7780cf14ba16b9935837b2ce5f (diff) | |
download | bcache-patches-dcc1da5ab0d2ea18f2f5e5dd769047fcc9c0bb43.tar.gz |
for-next: update v5 patch set
26 files changed, 2319 insertions, 0 deletions
diff --git a/for-next/v4-0000-cover-letter.patch b/for-next/v4/v4-0000-cover-letter.patch index 0327afe..0327afe 100644 --- a/for-next/v4-0000-cover-letter.patch +++ b/for-next/v4/v4-0000-cover-letter.patch diff --git a/for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch index 51edd0b..51edd0b 100644 --- a/for-next/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch +++ b/for-next/v4/v4-0001-bcache-set-writeback_rate_update_seconds-in-range.patch diff --git a/for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch index 113dd97..113dd97 100644 --- a/for-next/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch +++ b/for-next/v4/v4-0002-bcache-properly-set-task-state-in-bch_writeback_t.patch diff --git a/for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch index f85123b..f85123b 100644 --- a/for-next/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch +++ b/for-next/v4/v4-0003-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch diff --git a/for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch index 349a3d1..349a3d1 100644 --- a/for-next/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch +++ b/for-next/v4/v4-0004-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch diff --git a/for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch index 2e6ce9b..2e6ce9b 100644 --- a/for-next/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch +++ b/for-next/v4/v4-0005-bcache-stop-dc-writeback_rate_update-properly.patch diff --git 
a/for-next/v4-0006-bcache-set-error_limit-correctly.patch b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch index 927468d..927468d 100644 --- a/for-next/v4-0006-bcache-set-error_limit-correctly.patch +++ b/for-next/v4/v4-0006-bcache-set-error_limit-correctly.patch diff --git a/for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch index 849d522..849d522 100644 --- a/for-next/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch +++ b/for-next/v4/v4-0007-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch diff --git a/for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch index eab5e76..eab5e76 100644 --- a/for-next/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch +++ b/for-next/v4/v4-0008-bcache-stop-all-attached-bcache-devices-for-a-ret.patch diff --git a/for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch index 048a30a..048a30a 100644 --- a/for-next/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch +++ b/for-next/v4/v4-0009-bcache-fix-inaccurate-io-state-for-detached-bcach.patch diff --git a/for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch index 80f6dc8..80f6dc8 100644 --- a/for-next/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch +++ b/for-next/v4/v4-0010-bcache-add-backing_request_endio-for-bi_end_io-of.patch diff --git a/for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch index 6b4ae2a..6b4ae2a 100644 --- a/for-next/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch +++ 
b/for-next/v4/v4-0011-bcache-add-io_disable-to-struct-cached_dev.patch diff --git a/for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch index e73bf4f..e73bf4f 100644 --- a/for-next/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch +++ b/for-next/v4/v4-0012-bcache-stop-bcache-device-when-backing-device-is-.patch diff --git a/for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch index d9edf10..d9edf10 100644 --- a/for-next/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch +++ b/for-next/v4/v4-0013-bcache-add-stop_attached_devs_on_fail-to-struct-c.patch diff --git a/for-next/v5-0000-cover-letter.patch b/for-next/v5-0000-cover-letter.patch new file mode 100644 index 0000000..de57fc5 --- /dev/null +++ b/for-next/v5-0000-cover-letter.patch @@ -0,0 +1,92 @@ +From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 5 Feb 2018 18:26:45 +0800 +Subject: [PATCH v5 00/11] bcache: device failure handling improvement + +Hi maintainers and folks, + +This patch set tries to improve bcache device failure handling, includes +cache device and backing device failures. + +The basic idea to handle failed cache device is, +- Unregister cache set +- Detach all backing devices which are attached to this cache set +- Stop all the detached bcache devices (configurable) +- Stop all flash only volume on the cache set +The above process is named 'cache set retire' by me. The result of cache +set retire is, cache set and bcache devices are all removed, following +I/O requests will get failed immediately to notift upper layer or user +space coce that the cache device is failed or disconnected. 
+- Stop all the detached bcache devices (configurable) +- Stop all flash only volume on the cache set +The above process is named 'cache set retire' by me. The result of cache +set retire is, cache set and bcache devices are all removed +(configurable), following I/O requests will get failed immediately to +notify upper layer or user space code that the cache device is failed or +disconnected. + +There are 2 patches from v4 patch set merged into bcache-for-next, they +are not in v5 patch set any more. + +In v5 patch set I add a new patch "bcache: add stop_when_cache_set_failed +option to backing device", which provides "auto"/"always" options to +configure whether or not to stop bcache device for a broken cache set. + +Most of the patches are reviewed by Hannes Reinecke and Junhui Tang. There +are still several patches that need to be reviewed, +- [PATCH v5 03/11] bcache: quit dc->writeback_thread when + BCACHE_DEV_DETACHING is set +- [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to + backing device + +Any comment, question and review are warmly welcome. Thanks in advance. + +Changelog: +v5: add [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to + backing device. + fix issues from v4 patch set. + improve kernel message format, remove redundant prefix string. +v4: add per-cached_dev option stop_attached_devs_on_fail to avoid stopping + attached bcache device from a retiring cache set. +v3: fix detach issue found in v2 patch set. +v2: fixes all problems found in v1 review. + add patches to handle backing device failure. + add one more patch to set writeback_rate_update_seconds range. + include a patch from Junhui Tang. +v1: the initial version, only handles cache device failure.
+ +Coly Li + + +Coly Li (10): + bcache: set writeback_rate_update_seconds in range [1, 60] seconds + bcache: fix cached_dev->count usage for bch_cache_set_error() + bcache: quit dc->writeback_thread when BCACHE_DEV_DETACHING is set + bcache: stop dc->writeback_rate_update properly + bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags + bcache: stop all attached bcache devices for a retired cache set + bcache: add backing_request_endio() for bi_end_io of attached backing + device I/O + bcache: add io_disable to struct cached_dev + bcache: stop bcache device when backing device is offline + bcache: add stop_when_cache_set_failed option to backing device + +Tang Junhui (1): + bcache: fix inaccurate io state for detached bcache devices + + drivers/md/bcache/alloc.c | 3 +- + drivers/md/bcache/bcache.h | 44 ++++++++- + drivers/md/bcache/btree.c | 10 +- + drivers/md/bcache/io.c | 16 +++- + drivers/md/bcache/journal.c | 4 +- + drivers/md/bcache/request.c | 185 +++++++++++++++++++++++++++++++------ + drivers/md/bcache/super.c | 206 ++++++++++++++++++++++++++++++++++++++---- + drivers/md/bcache/sysfs.c | 59 +++++++++++- + drivers/md/bcache/util.h | 6 -- + drivers/md/bcache/writeback.c | 94 ++++++++++++++++--- + drivers/md/bcache/writeback.h | 5 +- + 11 files changed, 551 insertions(+), 81 deletions(-) + +-- +2.16.1 + diff --git a/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch b/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch new file mode 100644 index 0000000..d218724 --- /dev/null +++ b/for-next/v5-0001-bcache-set-writeback_rate_update_seconds-in-range.patch @@ -0,0 +1,79 @@ +From 31fa907c6f962fc229c4364ac08b239f7bf5384d Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 13 Jan 2018 15:11:03 +0800 +Subject: [PATCH v5 01/11] bcache: set writeback_rate_update_seconds in range + [1, 60] seconds + +dc->writeback_rate_update_seconds can be set via sysfs and its value can +be set to [1, ULONG_MAX]. 
It does not make sense to set such a large +value, 60 seconds is long enough value considering the default 5 seconds +works well for long time. + +Because dc->writeback_rate_update is a special delayed work, it re-arms +itself inside the delayed work routine update_writeback_rate(). When +stopping it by cancel_delayed_work_sync(), there should be a timeout to +wait and make sure the re-armed delayed work is stopped too. A small max +value of dc->writeback_rate_update_seconds is also helpful to decide a +reasonable small timeout. + +This patch limits sysfs interface to set dc->writeback_rate_update_seconds +in range of [1, 60] seconds, and replaces the hand-coded number by macros. + +Changelog: +v2: fix a rebase typo in v4, which is pointed out by Michael Lyle. +v1: initial version. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +--- + drivers/md/bcache/sysfs.c | 4 +++- + drivers/md/bcache/writeback.c | 2 +- + drivers/md/bcache/writeback.h | 3 +++ + 3 files changed, 7 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index c524305cc9a7..4a6a697e1680 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -218,7 +218,9 @@ STORE(__cached_dev) + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + +- d_strtoul_nonzero(writeback_rate_update_seconds); ++ sysfs_strtoul_clamp(writeback_rate_update_seconds, ++ dc->writeback_rate_update_seconds, ++ 1, WRITEBACK_RATE_UPDATE_SECS_MAX); + d_strtoul(writeback_rate_i_term_inverse); + d_strtoul_nonzero(writeback_rate_p_term_inverse); + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 58218f7e77c3..f1d2fc15abcc 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -655,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate.rate = 1024; + 
dc->writeback_rate_minimum = 8; + +- dc->writeback_rate_update_seconds = 5; ++ dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; + +diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h +index 66f1c527fa24..587b25599856 100644 +--- a/drivers/md/bcache/writeback.h ++++ b/drivers/md/bcache/writeback.h +@@ -8,6 +8,9 @@ + #define MAX_WRITEBACKS_IN_PASS 5 + #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ + ++#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 ++#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 ++ + /* + * 14 (16384ths) is chosen here as something that each backing device + * should be a reasonable fraction of the share, and not to blow up +-- +2.16.1 + diff --git a/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch b/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch new file mode 100644 index 0000000..1c09d03 --- /dev/null +++ b/for-next/v5-0002-bcache-fix-cached_dev-count-usage-for-bch_cache_s.patch @@ -0,0 +1,178 @@ +From c88508fc06f1d3d2241104d5e3b7b5e0045d24c0 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 8 Jan 2018 23:05:58 +0800 +Subject: [PATCH v5 02/11] bcache: fix cached_dev->count usage for + bch_cache_set_error() + +When bcache metadata I/O fails, bcache will call bch_cache_set_error() +to retire the whole cache set. The expected behavior to retire a cache +set is to unregister the cache set, and unregister all backing device +attached to this cache set, then remove sysfs entries of the cache set +and all attached backing devices, finally release memory of structs +cache_set, cache, cached_dev and bcache_device. + +In my testing when journal I/O failure triggered by disconnected cache +device, sometimes the cache set cannot be retired, and its sysfs +entry /sys/fs/bcache/<uuid> still exits and the backing device also +references it. This is not expected behavior. 
+ +When metadata I/O fails, the call sequence to retire whole cache set is, + bch_cache_set_error() + bch_cache_set_unregister() + bch_cache_set_stop() + __cache_set_unregister() <- called as callback by calling + closure_queue(&c->caching) + cache_set_flush() <- called as a callback when refcount + of cache_set->caching is 0 + cache_set_free() <- called as a callback when refcount + of cache_set->cl is 0 + bch_cache_set_release() <- called as a callback when refcount + of cache_set->kobj is 0 + +I find if kernel thread bch_writeback_thread() quits the while-loop when +kthread_should_stop() is true and searched_full_index is false, closure +callback cache_set_flush() set by continue_at() will never be called. The +result is, bcache fails to retire whole cache set. + +cache_set_flush() will be called when refcount of closure c->caching is 0, +and in function bcache_device_detach() refcount of closure c->caching is +released to 0 by closure_put(). In metadata error code path, function +bcache_device_detach() is called by cached_dev_detach_finish(). This is a +callback routine being called when cached_dev->count is 0. This refcount +is decreased by cached_dev_put(). + +The above dependence indicates, cache_set_flush() will be called when +refcount of cache_set->cl is 0, and refcount of cache_set->cl to be 0 +when refcount of cached_dev->count is 0. + +The reason why sometimes cached_dev->count is not 0 (when metadata I/O fails +and bch_cache_set_error() called) is, in bch_writeback_thread(), refcount +of cached_dev is not decreased properly. + +In bch_writeback_thread(), cached_dev_put() is called only when +searched_full_index is true and cached_dev->writeback_keys is empty, a.k.a +there is no dirty data on cache. In most of run time it is correct, but +when bch_writeback_thread() quits the while-loop while cache is still +dirty, current code forgets to call cached_dev_put() before this kernel +thread exits.
This is why sometimes cache_set_flush() is not executed and +cache set fails to be retired. + +The reason to call cached_dev_put() in bch_writeback_rate() is, when the +cache device changes from clean to dirty, cached_dev_get() is called, to +make sure during writeback operations both backing and cache devices +won't be released. + +Adding following code in bch_writeback_thread() does not work, + static int bch_writeback_thread(void *arg) + } + ++ if (atomic_read(&dc->has_dirty)) ++ cached_dev_put() ++ + return 0; + } +because writeback kernel thread can be woken up and started via sysfs entry: + echo 1 > /sys/block/bcache<N>/bcache/writeback_running +It is difficult to check whether backing device is dirty without race and +extra lock. So the above modification will introduce potential refcount +underflow in some conditions. + +The correct fix is, to take cached dev refcount when creating the kernel +thread, and put it before the kernel thread exits. Then bcache does not +need to take a cached dev refcount when cache turns from clean to dirty, +or to put a cached dev refcount when cache turns from dirty to clean. The +writeback kernel thread is always safe to reference data structure from +cache set, cache and cached device (because a refcount of cache device is +taken for it already), and no matter the kernel thread is stopped by I/O +errors or system reboot, cached_dev->count can always be used correctly. + +The patch is simple, but understanding how it works is quite complicated. + +Changelog: +v2: set dc->writeback_thread to NULL in this patch, as suggested by Hannes. +v1: initial version for review.
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/writeback.c | 11 ++++++++--- + drivers/md/bcache/writeback.h | 2 -- + 3 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index a2ad37a8afc0..7d96dc6860fa 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1052,7 +1052,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + bch_sectors_dirty_init(&dc->disk); + atomic_set(&dc->has_dirty, 1); +- refcount_inc(&dc->count); + bch_writeback_queue(dc); + } + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index f1d2fc15abcc..b280c134dd4d 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -572,7 +572,7 @@ static int bch_writeback_thread(void *arg) + + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); +- return 0; ++ break; + } + + schedule(); +@@ -585,7 +585,6 @@ static int bch_writeback_thread(void *arg) + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); +- cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + } +@@ -606,6 +605,9 @@ static int bch_writeback_thread(void *arg) + } + } + ++ dc->writeback_thread = NULL; ++ cached_dev_put(dc); ++ + return 0; + } + +@@ -669,10 +671,13 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + if (!dc->writeback_write_wq) + return -ENOMEM; + ++ cached_dev_get(dc); + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); +- if (IS_ERR(dc->writeback_thread)) ++ if (IS_ERR(dc->writeback_thread)) { ++ cached_dev_put(dc); + return PTR_ERR(dc->writeback_thread); ++ } + + 
schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h +index 587b25599856..0bba8f1c6cdf 100644 +--- a/drivers/md/bcache/writeback.h ++++ b/drivers/md/bcache/writeback.h +@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc) + { + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { +- refcount_inc(&dc->count); +- + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ +-- +2.16.1 + diff --git a/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch b/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch new file mode 100644 index 0000000..d9aa14d --- /dev/null +++ b/for-next/v5-0003-bcache-quit-dc-writeback_thread-when-BCACHE_DEV_D.patch @@ -0,0 +1,130 @@ +From ad362b22f19cca3073dfe9290529ea5ff63c8b4b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 14 Jan 2018 21:41:57 +0800 +Subject: [PATCH v5 03/11] bcache: quit dc->writeback_thread when + BCACHE_DEV_DETACHING is set + +In patch "bcache: fix cached_dev->count usage for bch_cache_set_error()", +cached_dev_get() is called when creating dc->writeback_thread, and +cached_dev_put() is called when exiting dc->writeback_thread. This +modification works well unless people detach the bcache device manually by + 'echo 1 > /sys/block/bcache<N>/bcache/detach' +Because this sysfs interface only calls bch_cached_dev_detach() which wakes +up dc->writeback_thread but does not stop it. The reason is, before patch +"bcache: fix cached_dev->count usage for bch_cache_set_error()", inside +bch_writeback_thread(), if cache is not dirty after writeback, +cached_dev_put() will be called here. And in cached_dev_make_request() when +a new write request makes cache from clean to dirty, cached_dev_get() will +be called there. 
Since we don't operate dc->count in these locations, +refcount d->count cannot be dropped after cache becomes clean, and +cached_dev_detach_finish() won't be called to detach bcache device. + +This patch fixes the issue by checking whether BCACHE_DEV_DETACHING is +set inside bch_writeback_thread(). If this bit is set and cache is clean +(no existing writeback_keys), break the while-loop, call cached_dev_put() +and quit the writeback thread. + +Please note if cache is still dirty, even BCACHE_DEV_DETACHING is set the +writeback thread should continue to perform writeback, this is the original +design of manually detach. + +It is safe to do the following check without locking, let me explain why, ++ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && ++ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + +If the kenrel thread does not sleep and continue to run due to conditions +are not updated in time on the running CPU core, it just consumes more CPU +cycles and has no hurt. This should-sleep-but-run is safe here. We just +focus on the should-run-but-sleep condition, which means the writeback +thread goes to sleep in mistake while it should continue to run. +1, First of all, no matter the writeback thread is hung or not, kthread_stop() from + cached_dev_detach_finish() will wake up it and terminate by making + kthread_should_stop() return true. And in normal run time, bit on index + BCACHE_DEV_DETACHING is always cleared, the condition + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) + is always true and can be ignored as constant value. +2, If one of the following conditions is true, the writeback thread should + go to sleep, + "!atomic_read(&dc->has_dirty)" or "!dc->writeback_running)" + each of them independently controls the writeback thread should sleep or + not, let's analyse them one by one. 
+2.1 condition "!atomic_read(&dc->has_dirty)" + If dc->has_dirty is set from 0 to 1 on another CPU core, bcache will + call bch_writeback_queue() immediately or call bch_writeback_add() which + indirectly calls bch_writeback_queue() too. In bch_writeback_queue(), + wake_up_process(dc->writeback_thread) is called. It sets writeback + thread's task state to TASK_RUNNING and following an implicit memory + barrier, then tries to wake up the writeback thread. + In writeback thread, its task state is set to TASK_INTERRUPTIBLE before + doing the condition check. If other CPU core sets the TASK_RUNNING state + after writeback thread setting TASK_INTERRUPTIBLE, the writeback thread + will be scheduled to run very soon because its state is not + TASK_INTERRUPTIBLE. If other CPU core sets the TASK_RUNNING state before + writeback thread setting TASK_INTERRUPTIBLE, the implict memory barrier + of wake_up_process() will make sure modification of dc->has_dirty on + other CPU core is updated and observed on the CPU core of writeback + thread. Therefore the condition check will correctly be false, and + continue writeback code without sleeping. +2.2 condition "!dc->writeback_running)" + dc->writeback_running can be changed via sysfs file, every time it is + modified, a following bch_writeback_queue() is alwasy called. So the + change is always observed on the CPU core of writeback thread. If + dc->writeback_running is changed from 0 to 1 on other CPU core, this + condition check will observe the modification and allow writeback + thread to continue to run without sleeping. +Now we can see, even without a locking protection, multiple conditions +check is safe here, no deadlock or process hang up will happen. + +I compose a separte patch because that patch "bcache: fix cached_dev->count +usage for bch_cache_set_error()" already gets a "Reviewed-by:" from Hannes +Reinecke. Also this fix is not trivial and good for a separate patch. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.com> +Cc: Huijun Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/writeback.c | 20 +++++++++++++++++--- + 1 file changed, 17 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index b280c134dd4d..4dbeaaa575bf 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -565,9 +565,15 @@ static int bch_writeback_thread(void *arg) + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); +- if (!atomic_read(&dc->has_dirty) || +- (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && +- !dc->writeback_running)) { ++ /* ++ * If the bache device is detaching, skip here and continue ++ * to perform writeback. Otherwise, if no dirty data on cache, ++ * or there is dirty data on cache but writeback is disabled, ++ * the writeback thread should sleep here and wait for others ++ * to wake up it. ++ */ ++ if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && ++ (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + up_write(&dc->writeback_lock); + + if (kthread_should_stop()) { +@@ -587,6 +593,14 @@ static int bch_writeback_thread(void *arg) + atomic_set(&dc->has_dirty, 0); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); ++ /* ++ * If bcache device is detaching via sysfs interface, ++ * writeback thread should stop after there is no dirty ++ * data on cache. BCACHE_DEV_DETACHING flag is set in ++ * bch_cached_dev_detach(). 
++ */ ++ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) ++ break; + } + + up_write(&dc->writeback_lock); +-- +2.16.1 + diff --git a/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch b/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch new file mode 100644 index 0000000..83524b8 --- /dev/null +++ b/for-next/v5-0004-bcache-stop-dc-writeback_rate_update-properly.patch @@ -0,0 +1,268 @@ +From f4ce0fb52b341b89a5302daa4a0c2ce716281867 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 13 Jan 2018 15:48:39 +0800 +Subject: [PATCH v5 04/11] bcache: stop dc->writeback_rate_update properly + +struct delayed_work writeback_rate_update in struct cache_dev is a delayed +worker to call function update_writeback_rate() in period (the interval is +defined by dc->writeback_rate_update_seconds). + +When a metadate I/O error happens on cache device, bcache error handling +routine bch_cache_set_error() will call bch_cache_set_unregister() to +retire whole cache set. On the unregister code path, this delayed work is +stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update). + +dc->writeback_rate_update is a special delayed work from others in bcache. +In its routine update_writeback_rate(), this delayed work is re-armed +itself. That means when cancel_delayed_work_sync() returns, this delayed +work can still be executed after several seconds defined by +dc->writeback_rate_update_seconds. + +The problem is, after cancel_delayed_work_sync() returns, the cache set +unregister code path will continue and release memory of struct cache set. +Then the delayed work is scheduled to run, __update_writeback_rate() +will reference the already released cache_set memory, and trigger a NULL +pointer deference fault. + +This patch introduces two more bcache device flags, +- BCACHE_DEV_WB_RUNNING + bit set: bcache device is in writeback mode and running, it is OK for + dc->writeback_rate_update to re-arm itself. 
+ bit clear:bcache device is trying to stop dc->writeback_rate_update, + this delayed work should not re-arm itself and quit. +- BCACHE_DEV_RATE_DW_RUNNING + bit set: routine update_writeback_rate() is executing. + bit clear: routine update_writeback_rate() quits. + +This patch also adds a function cancel_writeback_rate_update_dwork() to +wait for dc->writeback_rate_update quits before cancel it by calling +cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected +quit dc->writeback_rate_update, after time_out seconds this function will +give up and continue to call cancel_delayed_work_sync(). + +And here I explain how this patch stops self re-armed delayed work properly +with the above stuffs. + +update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning +and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling +cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING. + +Before calling cancel_delayed_work_sync() wait utill flag +BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling +cancel_delayed_work_sync(), dc->writeback_rate_update must be already re- +armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases +delayed work routine update_writeback_rate() won't be executed after +cancel_delayed_work_sync() returns. + +Inside update_writeback_rate() before calling schedule_delayed_work(), flag +BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means +someone is about to stop the delayed work. Because flag +BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync() +has to wait for this flag to be cleared, we don't need to worry about race +condition here. + +If update_writeback_rate() is scheduled to run after checking +BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync() +in cancel_writeback_rate_update_dwork(), it is also safe. Because at this +moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. 
As I mentioned +previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear +and quit immediately. + +Because there are more dependences inside update_writeback_rate() to struct +cache_set memory, dc->writeback_rate_update is not a simple self re-arm +delayed work. After trying many different methods (e.g. hold dc->count, or +use locks), this is the only way I can find which works to properly stop +dc->writeback_rate_update delayed work. + +Changelog: +v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING + to bit index, for test_bit(). +v2: Try to fix the race issue which is pointed out by Junhui. +v1: The initial version for review + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Hannes Reinecke <hare@suse.com> +--- + drivers/md/bcache/bcache.h | 9 +++++---- + drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++---- + drivers/md/bcache/sysfs.c | 3 ++- + drivers/md/bcache/writeback.c | 29 ++++++++++++++++++++++++++++- + 4 files changed, 70 insertions(+), 10 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index b8c2e1bef1f1..0380626bf525 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -258,10 +258,11 @@ struct bcache_device { + struct gendisk *disk; + + unsigned long flags; +-#define BCACHE_DEV_CLOSING 0 +-#define BCACHE_DEV_DETACHING 1 +-#define BCACHE_DEV_UNLINK_DONE 2 +- ++#define BCACHE_DEV_CLOSING 0 ++#define BCACHE_DEV_DETACHING 1 ++#define BCACHE_DEV_UNLINK_DONE 2 ++#define BCACHE_DEV_WB_RUNNING 3 ++#define BCACHE_DEV_RATE_DW_RUNNING 4 + unsigned nr_stripes; + unsigned stripe_size; + atomic_t *stripe_sectors_dirty; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 7d96dc6860fa..e15cacecf078 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -899,6 +899,32 @@ void bch_cached_dev_run(struct cached_dev *dc) + 
pr_debug("error creating sysfs link"); + } + ++/* ++ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed ++ * work dc->writeback_rate_update is running. Wait until the routine ++ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to ++ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out ++ * seconds, give up waiting here and continue to cancel it too. ++ */ ++static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) ++{ ++ int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ; ++ ++ do { ++ if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING, ++ &dc->disk.flags)) ++ break; ++ time_out--; ++ schedule_timeout_interruptible(1); ++ } while (time_out > 0); ++ ++ if (time_out == 0) ++ pr_warn("give up waiting for dc->writeback_write_update" ++ " to quit"); ++ ++ cancel_delayed_work_sync(&dc->writeback_rate_update); ++} ++ + static void cached_dev_detach_finish(struct work_struct *w) + { + struct cached_dev *dc = container_of(w, struct cached_dev, detach); +@@ -911,7 +937,9 @@ static void cached_dev_detach_finish(struct work_struct *w) + + mutex_lock(&bch_register_lock); + +- cancel_delayed_work_sync(&dc->writeback_rate_update); ++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cancel_writeback_rate_update_dwork(dc); ++ + if (!IS_ERR_OR_NULL(dc->writeback_thread)) { + kthread_stop(dc->writeback_thread); + dc->writeback_thread = NULL; +@@ -954,6 +982,7 @@ void bch_cached_dev_detach(struct cached_dev *dc) + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); ++ + cached_dev_put(dc); + } + +@@ -1079,14 +1108,16 @@ static void cached_dev_free(struct closure *cl) + { + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + +- cancel_delayed_work_sync(&dc->writeback_rate_update); ++ mutex_lock(&bch_register_lock); ++ ++ if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ cancel_writeback_rate_update_dwork(dc); ++ + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + 
kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); + +- mutex_lock(&bch_register_lock); +- + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + bcache_device_free(&dc->disk); +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 4a6a697e1680..399e91cbf714 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -306,7 +306,8 @@ STORE(bch_cached_dev) + bch_writeback_queue(dc); + + if (attr == &sysfs_writeback_percent) +- schedule_delayed_work(&dc->writeback_rate_update, ++ if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) ++ schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 4dbeaaa575bf..8f98ef1038d3 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -115,6 +115,21 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev, + writeback_rate_update); + ++ /* ++ * should check BCACHE_DEV_RATE_DW_RUNNING before calling ++ * cancel_delayed_work_sync(). 
++ */ ++ set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); ++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ ++ smp_mb(); ++ ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); ++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ ++ smp_mb(); ++ return; ++ } ++ + down_read(&dc->writeback_lock); + + if (atomic_read(&dc->has_dirty) && +@@ -123,8 +138,18 @@ static void update_writeback_rate(struct work_struct *work) + + up_read(&dc->writeback_lock); + +- schedule_delayed_work(&dc->writeback_rate_update, ++ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); ++ } ++ ++ /* ++ * should check BCACHE_DEV_RATE_DW_RUNNING before calling ++ * cancel_delayed_work_sync(). ++ */ ++ clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); ++ /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ ++ smp_mb(); + } + + static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +@@ -675,6 +700,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; + ++ WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + } + +@@ -693,6 +719,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) + return PTR_ERR(dc->writeback_thread); + } + ++ WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + +-- +2.16.1 + diff --git a/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch b/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch new file mode 100644 index 0000000..57afe27 --- /dev/null +++ 
b/for-next/v5-0005-bcache-add-CACHE_SET_IO_DISABLE-to-struct-cache_s.patch @@ -0,0 +1,491 @@ +From 805fefa99ced3d184d69c65449eb3e24104346b9 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sun, 14 Jan 2018 22:15:00 +0800 +Subject: [PATCH v5 05/11] bcache: add CACHE_SET_IO_DISABLE to struct cache_set + flags + +When too many I/Os failed on cache device, bch_cache_set_error() is called +in the error handling code path to retire whole problematic cache set. If +new I/O requests continue to come and take refcount dc->count, the cache +set won't be retired immediately, this is a problem. + +Further more, there are several kernel thread and self-armed kernel work +may still running after bch_cache_set_error() is called. It needs to wait +quite a while for them to stop, or they won't stop at all. They also +prevent the cache set from being retired. + +The solution in this patch is, to add per cache set flag to disable I/O +request on this cache and all attached backing devices. Then new coming I/O +requests can be rejected in *_make_request() before taking refcount, kernel +threads and self-armed kernel worker can stop very fast when flags bit +CACHE_SET_IO_DISABLE is set. + +Because bcache also do internal I/Os for writeback, garbage collection, +bucket allocation, journaling, this kind of I/O should be disabled after +bch_cache_set_error() is called. So closure_bio_submit() is modified to +check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set, +closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and +return, generic_make_request() won't be called. + +A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit +from cache_set->flags, to disable or enable cache set I/O for debugging. It +is helpful to trigger more corner case issues for failed cache device. + +Changelog +v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index. + remove "bcache: " prefix when printing out kernel message. 
+v2, more changes by previous review, +- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui. +- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this + is reported and inspired from origal patch of Pavel Vazharov. +v1, initial version. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Pavel Vazharov <freakpv@gmail.com> +--- + drivers/md/bcache/alloc.c | 3 ++- + drivers/md/bcache/bcache.h | 18 ++++++++++++++++++ + drivers/md/bcache/btree.c | 10 +++++++--- + drivers/md/bcache/io.c | 2 +- + drivers/md/bcache/journal.c | 4 ++-- + drivers/md/bcache/request.c | 26 +++++++++++++++++++------- + drivers/md/bcache/super.c | 6 +++++- + drivers/md/bcache/sysfs.c | 20 ++++++++++++++++++++ + drivers/md/bcache/util.h | 6 ------ + drivers/md/bcache/writeback.c | 35 +++++++++++++++++++++++++++-------- + 10 files changed, 101 insertions(+), 29 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index 458e1d38577d..004cc3cc6123 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -287,7 +287,8 @@ do { \ + break; \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ +- if (kthread_should_stop()) { \ ++ if (kthread_should_stop() || \ ++ test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) { \ + set_current_state(TASK_RUNNING); \ + return 0; \ + } \ +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 0380626bf525..7917b3820dd5 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -475,10 +475,15 @@ struct gc_stat { + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. ++ * ++ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all ++ * external and internal I/O should be denied when this flag is set. 
++ * + */ + #define CACHE_SET_UNREGISTERING 0 + #define CACHE_SET_STOPPING 1 + #define CACHE_SET_RUNNING 2 ++#define CACHE_SET_IO_DISABLE 3 + + struct cache_set { + struct closure cl; +@@ -868,6 +873,19 @@ static inline void wake_up_allocators(struct cache_set *c) + wake_up_process(ca->alloc_thread); + } + ++static inline void closure_bio_submit(struct cache_set *c, ++ struct bio *bio, ++ struct closure *cl) ++{ ++ closure_get(cl); ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { ++ bio->bi_status = BLK_STS_IOERR; ++ bio_endio(bio); ++ return; ++ } ++ generic_make_request(bio); ++} ++ + /* Forward declarations */ + + void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index fad9fe8817eb..8ca50f387a1d 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1744,6 +1744,7 @@ static void bch_btree_gc(struct cache_set *c) + + btree_gc_start(c); + ++ /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); +@@ -1751,7 +1752,7 @@ static void bch_btree_gc(struct cache_set *c) + + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); +- } while (ret); ++ } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + bch_btree_gc_finish(c); + wake_up_allocators(c); +@@ -1789,9 +1790,12 @@ static int bch_gc_thread(void *arg) + + while (1) { + wait_event_interruptible(c->gc_wait, +- kthread_should_stop() || gc_should_run(c)); ++ kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags) || ++ gc_should_run(c)); + +- if (kthread_should_stop()) ++ if (kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + break; + + set_gc_sectors(c); +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index a783c5a41ff1..8013ecbcdbda 100644 +--- a/drivers/md/bcache/io.c ++++ b/drivers/md/bcache/io.c +@@ -38,7 +38,7 @@ void 
__bch_submit_bbio(struct bio *bio, struct cache_set *c) + bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); + + b->submit_time_us = local_clock_us(); +- closure_bio_submit(bio, bio->bi_private); ++ closure_bio_submit(c, bio, bio->bi_private); + } + + void bch_submit_bbio(struct bio *bio, struct cache_set *c, +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 1b736b860739..c94085f400a4 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -62,7 +62,7 @@ reread: left = ca->sb.bucket_size - offset; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch_bio_map(bio, data); + +- closure_bio_submit(bio, &cl); ++ closure_bio_submit(ca->set, bio, &cl); + closure_sync(&cl); + + /* This function could be simpler now since we no longer write +@@ -674,7 +674,7 @@ static void journal_write_unlocked(struct closure *cl) + spin_unlock(&c->journal.lock); + + while ((bio = bio_list_pop(&list))) +- closure_bio_submit(bio, cl); ++ closure_bio_submit(c, bio, cl); + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 1a46b41dac70..02296bda6384 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -747,7 +747,7 @@ static void cached_dev_read_error(struct closure *cl) + + /* XXX: invalidate cache */ + +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + } + + continue_at(cl, cached_dev_cache_miss_done, NULL); +@@ -872,7 +872,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); +- closure_bio_submit(cache_bio, &s->cl); ++ closure_bio_submit(s->iop.c, cache_bio, &s->cl); + + return ret; + out_put: +@@ -880,7 +880,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; +- closure_bio_submit(miss, &s->cl); ++ closure_bio_submit(s->iop.c, 
miss, &s->cl); + return ret; + } + +@@ -945,7 +945,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) + + if ((bio_op(bio) != REQ_OP_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + } else if (s->iop.writeback) { + bch_writeback_add(dc); + s->iop.bio = bio; +@@ -960,12 +960,12 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) + flush->bi_private = cl; + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + +- closure_bio_submit(flush, cl); ++ closure_bio_submit(s->iop.c, flush, cl); + } + } else { + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); + +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + } + + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); +@@ -981,7 +981,7 @@ static void cached_dev_nodata(struct closure *cl) + bch_journal_meta(s->iop.c, cl); + + /* If it's a flush, we send the flush to the backing device too */ +- closure_bio_submit(bio, cl); ++ closure_bio_submit(s->iop.c, bio, cl); + + continue_at(cl, cached_dev_bio_complete, NULL); + } +@@ -996,6 +996,12 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int rw = bio_data_dir(bio); + ++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ bio->bi_status = BLK_STS_IOERR; ++ bio_endio(bio); ++ return BLK_QC_T_NONE; ++ } ++ + atomic_set(&dc->backing_idle, 0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + +@@ -1112,6 +1118,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, + struct bcache_device *d = bio->bi_disk->private_data; + int rw = bio_data_dir(bio); + ++ if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ bio->bi_status = BLK_STS_IOERR; ++ bio_endio(bio); ++ return BLK_QC_T_NONE; ++ } ++ + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); + + s 
= search_alloc(bio, d); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index e15cacecf078..f8b0d1196c12 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -521,7 +521,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, + bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags); + bch_bio_map(bio, ca->disk_buckets); + +- closure_bio_submit(bio, &ca->prio); ++ closure_bio_submit(ca->set, bio, &ca->prio); + closure_sync(cl); + } + +@@ -1349,6 +1349,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) + test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; + ++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ pr_warn("CACHE_SET_IO_DISABLE already set"); ++ + /* XXX: we can be called from atomic context + acquire_console_sem(); + */ +@@ -1584,6 +1587,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; ++ WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + return c; + err: +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index 399e91cbf714..cf973c07c856 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -95,6 +95,7 @@ read_attribute(partial_stripes_expensive); + + rw_attribute(synchronous); + rw_attribute(journal_delay_ms); ++rw_attribute(io_disable); + rw_attribute(discard); + rw_attribute(running); + rw_attribute(label); +@@ -588,6 +589,8 @@ SHOW(__bch_cache_set) + sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ sysfs_printf(io_disable, "%i", ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + if (attr == &sysfs_bset_tree_stats) + return bch_bset_print_stats(c, buf); +@@ -677,6 +680,22 @@ STORE(__bch_cache_set) + if (attr == 
&sysfs_io_error_halflife) + c->error_decay = strtoul_or_return(buf) / 88; + ++ if (attr == &sysfs_io_disable) { ++ int v = strtoul_or_return(buf); ++ ++ if (v) { ++ if (test_and_set_bit(CACHE_SET_IO_DISABLE, ++ &c->flags)) ++ pr_warn("CACHE_SET_IO_DISABLE" ++ " already set"); ++ } else { ++ if (!test_and_clear_bit(CACHE_SET_IO_DISABLE, ++ &c->flags)) ++ pr_warn("CACHE_SET_IO_DISABLE" ++ " already cleared"); ++ } ++ } ++ + sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul(verify, c->verify); + sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); +@@ -762,6 +781,7 @@ static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, ++ &sysfs_io_disable, + NULL + }; + KTYPE(bch_cache_set_internal); +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index a6763db7f061..268024529edd 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev) + return bdev->bd_inode->i_size >> 9; + } + +-#define closure_bio_submit(bio, cl) \ +-do { \ +- closure_get(cl); \ +- generic_make_request(bio); \ +-} while (0) +- + uint64_t bch_crc64_update(uint64_t, const void *, size_t); + uint64_t bch_crc64(const void *, size_t); + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 8f98ef1038d3..3d7d8452e0de 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -114,6 +114,7 @@ static void update_writeback_rate(struct work_struct *work) + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); ++ struct cache_set *c = dc->disk.c; + + /* + * should check BCACHE_DEV_RATE_DW_RUNNING before calling +@@ -123,7 +124,12 @@ static void update_writeback_rate(struct work_struct *work) + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); + +- if 
(!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ /* ++ * CACHE_SET_IO_DISABLE might be set via sysfs interface, ++ * check it here too. ++ */ ++ if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags); + /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */ + smp_mb(); +@@ -138,7 +144,12 @@ static void update_writeback_rate(struct work_struct *work) + + up_read(&dc->writeback_lock); + +- if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) { ++ /* ++ * CACHE_SET_IO_DISABLE might be set via sysfs interface, ++ * check it here too. ++ */ ++ if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } +@@ -278,7 +289,7 @@ static void write_dirty(struct closure *cl) + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; + +- closure_bio_submit(&io->bio, cl); ++ closure_bio_submit(io->dc->disk.c, &io->bio, cl); + } + + atomic_set(&dc->writeback_sequence_next, next_sequence); +@@ -304,7 +315,7 @@ static void read_dirty_submit(struct closure *cl) + { + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + +- closure_bio_submit(&io->bio, cl); ++ closure_bio_submit(io->dc->disk.c, &io->bio, cl); + + continue_at(cl, write_dirty, io->dc->writeback_write_wq); + } +@@ -330,7 +341,9 @@ static void read_dirty(struct cached_dev *dc) + + next = bch_keybuf_next(&dc->writeback_keys); + +- while (!kthread_should_stop() && next) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && ++ next) { + size = 0; + nk = 0; + +@@ -427,7 +440,9 @@ static void read_dirty(struct cached_dev *dc) + } + } + +- while (!kthread_should_stop() && delay) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && ++ delay) { + 
schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } +@@ -583,11 +598,13 @@ static bool refill_dirty(struct cached_dev *dc) + static int bch_writeback_thread(void *arg) + { + struct cached_dev *dc = arg; ++ struct cache_set *c = dc->disk.c; + bool searched_full_index; + + bch_ratelimit_reset(&dc->writeback_rate); + +- while (!kthread_should_stop()) { ++ while (!kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + /* +@@ -601,7 +618,8 @@ static int bch_writeback_thread(void *arg) + (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) { + up_write(&dc->writeback_lock); + +- if (kthread_should_stop()) { ++ if (kthread_should_stop() || ++ test_bit(CACHE_SET_IO_DISABLE, &c->flags)) { + set_current_state(TASK_RUNNING); + break; + } +@@ -637,6 +655,7 @@ static int bch_writeback_thread(void *arg) + + while (delay && + !kthread_should_stop() && ++ !test_bit(CACHE_SET_IO_DISABLE, &c->flags) && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_interruptible(delay); + +-- +2.16.1 + diff --git a/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch b/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch new file mode 100644 index 0000000..7cea41c --- /dev/null +++ b/for-next/v5-0006-bcache-stop-all-attached-bcache-devices-for-a-ret.patch @@ -0,0 +1,67 @@ +From e741114d13958a91559cd8c376cf704cb2180370 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 00:26:32 +0800 +Subject: [PATCH v5 06/11] bcache: stop all attached bcache devices for a + retired cache set + +When there are too many I/O errors on cache device, current bcache code +will retire the whole cache set, and detach all bcache devices. But the +detached bcache devices are not stopped, which is problematic when bcache +is in writeback mode. 
+
+If the retired cache set has dirty data of backing devices, continue
+writing to bcache device will write to backing device directly. If the
+LBA of write request has a dirty version cached on cache device, next time
+when the cache device is re-registered and backing device re-attached to
+it again, the stale dirty data on cache device will be written to backing
+device, and overwrite latest directly written data. This situation causes
+quite severe data corruption.
+
+This patch checks whether cache_set->io_disable is true in
+__cache_set_unregister(). If cache_set->io_disable is true, it means cache
+set is unregistering by too many I/O errors, then all attached bcache
+devices will be stopped as well. If cache_set->io_disable is not true, it
+means __cache_set_unregister() is triggered by writing 1 to sysfs file
+/sys/fs/bcache/<UUID>/bcache/stop. This is an exception because users do
+it explicitly, this patch keeps existing behavior and does not stop any
+bcache device.
+
+Even if the failed cache device has no dirty data, stopping bcache device is
+still a desired behavior by many Ceph and database users. Then their
+application will report I/O errors due to disappeared bcache device, and
+operations people will know the cache device is broken or disconnected.
+
+Changelog:
+v2: add reviewed-by from Hannes.
+v1: initial version for review.
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +--- + drivers/md/bcache/super.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index f8b0d1196c12..41ef438e7b40 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1478,6 +1478,14 @@ static void __cache_set_unregister(struct closure *cl) + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); ++ /* ++ * If we come here by too many I/O errors, ++ * bcache device should be stopped too, to ++ * keep data consistency on cache and ++ * backing devices. ++ */ ++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ bcache_device_stop(c->devices[i]); + } else { + bcache_device_stop(c->devices[i]); + } +-- +2.16.1 + diff --git a/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch b/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch new file mode 100644 index 0000000..419c2c7 --- /dev/null +++ b/for-next/v5-0007-bcache-fix-inaccurate-io-state-for-detached-bcach.patch @@ -0,0 +1,119 @@ +From d6f9f789096d2c3473314fa10ea1166683399ac8 Mon Sep 17 00:00:00 2001 +From: Tang Junhui <tang.junhui@zte.com.cn> +Date: Tue, 9 Jan 2018 10:27:11 +0800 +Subject: [PATCH v5 07/11] bcache: fix inaccurate io state for detached bcache + devices + +When we run IO in a detached device, and run iostat to shows IO status, +normally it will show like bellow (Omitted some fields): +Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util +sdd ... 15.89 0.53 1.82 0.20 2.23 1.81 52.30 +bcache0 ... 15.89 115.42 0.00 0.00 0.00 2.40 69.60 +but after IO stopped, there are still very big avgqu-sz and %util +values as bellow: +Device: ... avgrq-sz avgqu-sz await r_await w_await svctm %util +bcache0 ... 
0 5326.32 0.00 0.00 0.00 0.00 100.10 + +The reason for this issue is that, only generic_start_io_acct() called +and no generic_end_io_acct() called for detached device in +cached_dev_make_request(). See the code: +//start generic_start_io_acct() +generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); +if (cached_dev_get(dc)) { + //will callback generic_end_io_acct() +} +else { + //will not call generic_end_io_acct() +} + +This patch calls generic_end_io_acct() in the end of IO for detached +devices, so we can show IO state correctly. + +(Modified to use GFP_NOIO in kzalloc() by Coly Li) + +Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> +Reviewed-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +--- + drivers/md/bcache/request.c | 58 +++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 51 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 02296bda6384..e09c5ae745be 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -986,6 +986,55 @@ static void cached_dev_nodata(struct closure *cl) + continue_at(cl, cached_dev_bio_complete, NULL); + } + ++struct detached_dev_io_private { ++ struct bcache_device *d; ++ unsigned long start_time; ++ bio_end_io_t *bi_end_io; ++ void *bi_private; ++}; ++ ++static void detatched_dev_end_io(struct bio *bio) ++{ ++ struct detached_dev_io_private *ddip; ++ ++ ddip = bio->bi_private; ++ bio->bi_end_io = ddip->bi_end_io; ++ bio->bi_private = ddip->bi_private; ++ ++ generic_end_io_acct(ddip->d->disk->queue, ++ bio_data_dir(bio), ++ &ddip->d->disk->part0, ddip->start_time); ++ ++ kfree(ddip); ++ ++ bio->bi_end_io(bio); ++} ++ ++static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) ++{ ++ struct detached_dev_io_private *ddip; ++ struct cached_dev *dc = container_of(d, struct cached_dev, disk); ++ ++ /* ++ * no need to call closure_get(&dc->disk.cl), ++ * because upper layer had 
already opened bcache device, ++ * which would call closure_get(&dc->disk.cl) ++ */ ++ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ++ ddip->d = d; ++ ddip->start_time = jiffies; ++ ddip->bi_end_io = bio->bi_end_io; ++ ddip->bi_private = bio->bi_private; ++ bio->bi_end_io = detatched_dev_end_io; ++ bio->bi_private = ddip; ++ ++ if ((bio_op(bio) == REQ_OP_DISCARD) && ++ !blk_queue_discard(bdev_get_queue(dc->bdev))) ++ bio->bi_end_io(bio); ++ else ++ generic_make_request(bio); ++} ++ + /* Cached devices - read & write stuff */ + + static blk_qc_t cached_dev_make_request(struct request_queue *q, +@@ -1028,13 +1077,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + else + cached_dev_read(dc, s); + } +- } else { +- if ((bio_op(bio) == REQ_OP_DISCARD) && +- !blk_queue_discard(bdev_get_queue(dc->bdev))) +- bio_endio(bio); +- else +- generic_make_request(bio); +- } ++ } else ++ detached_dev_do_request(d, bio); + + return BLK_QC_T_NONE; + } +-- +2.16.1 + diff --git a/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch b/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch new file mode 100644 index 0000000..4cddd78 --- /dev/null +++ b/for-next/v5-0008-bcache-add-backing_request_endio-for-bi_end_io-of.patch @@ -0,0 +1,255 @@ +From e76beb2960de33506c0f6e177d43f8a8cfafee30 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 21:01:48 +0800 +Subject: [PATCH v5 08/11] bcache: add backing_request_endio() for bi_end_io of + attached backing device I/O + +In order to catch I/O error of backing device, a separate bi_end_io +call back is required. Then a per backing device counter can record I/O +errors number and retire the backing device if the counter reaches a +per backing device I/O error limit. + +This patch adds backing_request_endio() to bcache backing device I/O code +path, this is a preparation for further complicated backing device failure +handling. 
So far there is no real code logic change, I make this change a +separate patch to make sure it is stable and reliable for further work. + +Changelog: +v2: Fix code comments typo, remove a redundant bch_writeback_add() line + added in v4 patch set. +v1: indeed this is new added in this patch set. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Michael Lyle <mlyle@lyle.org> +--- + drivers/md/bcache/request.c | 93 +++++++++++++++++++++++++++++++++++-------- + drivers/md/bcache/super.c | 1 + + drivers/md/bcache/writeback.c | 1 + + 3 files changed, 79 insertions(+), 16 deletions(-) + +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index e09c5ae745be..9c6dda3b0068 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl) + } + + op->insert_data_done = true; ++ /* get in bch_data_insert() */ + bio_put(bio); + out: + continue_at(cl, bch_data_insert_keys, op->wq); +@@ -630,6 +631,38 @@ static void request_endio(struct bio *bio) + closure_put(cl); + } + ++static void backing_request_endio(struct bio *bio) ++{ ++ struct closure *cl = bio->bi_private; ++ ++ if (bio->bi_status) { ++ struct search *s = container_of(cl, struct search, cl); ++ /* ++ * If a bio has REQ_PREFLUSH for writeback mode, it is ++ * speically assembled in cached_dev_write() for a non-zero ++ * write request which has REQ_PREFLUSH. we don't set ++ * s->iop.status by this failure, the status will be decided ++ * by result of bch_data_insert() operation. 
++ */ ++ if (unlikely(s->iop.writeback && ++ bio->bi_opf & REQ_PREFLUSH)) { ++ char buf[BDEVNAME_SIZE]; ++ ++ bio_devname(bio, buf); ++ pr_err("Can't flush %s: returned bi_status %i", ++ buf, bio->bi_status); ++ } else { ++ /* set to orig_bio->bi_status in bio_complete() */ ++ s->iop.status = bio->bi_status; ++ } ++ s->recoverable = false; ++ /* should count I/O error for backing device here */ ++ } ++ ++ bio_put(bio); ++ closure_put(cl); ++} ++ + static void bio_complete(struct search *s) + { + if (s->orig_bio) { +@@ -644,13 +677,21 @@ static void bio_complete(struct search *s) + } + } + +-static void do_bio_hook(struct search *s, struct bio *orig_bio) ++static void do_bio_hook(struct search *s, ++ struct bio *orig_bio, ++ bio_end_io_t *end_io_fn) + { + struct bio *bio = &s->bio.bio; + + bio_init(bio, NULL, 0); + __bio_clone_fast(bio, orig_bio); +- bio->bi_end_io = request_endio; ++ /* ++ * bi_end_io can be set separately somewhere else, e.g. the ++ * variants in, ++ * - cache_bio->bi_end_io from cached_dev_cache_miss() ++ * - n->bi_end_io from cache_lookup_fn() ++ */ ++ bio->bi_end_io = end_io_fn; + bio->bi_private = &s->cl; + + bio_cnt_set(bio, 3); +@@ -676,7 +717,7 @@ static inline struct search *search_alloc(struct bio *bio, + s = mempool_alloc(d->c->search, GFP_NOIO); + + closure_init(&s->cl, NULL); +- do_bio_hook(s, bio); ++ do_bio_hook(s, bio, request_endio); + + s->orig_bio = bio; + s->cache_miss = NULL; +@@ -743,10 +784,11 @@ static void cached_dev_read_error(struct closure *cl) + trace_bcache_read_retry(s->orig_bio); + + s->iop.status = 0; +- do_bio_hook(s, s->orig_bio); ++ do_bio_hook(s, s->orig_bio, backing_request_endio); + + /* XXX: invalidate cache */ + ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, bio, cl); + } + +@@ -859,7 +901,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + bio_copy_dev(cache_bio, miss); + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + +- cache_bio->bi_end_io = 
request_endio; ++ cache_bio->bi_end_io = backing_request_endio; + cache_bio->bi_private = &s->cl; + + bch_bio_map(cache_bio, NULL); +@@ -872,14 +914,16 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, cache_bio, &s->cl); + + return ret; + out_put: + bio_put(cache_bio); + out_submit: +- miss->bi_end_io = request_endio; ++ miss->bi_end_io = backing_request_endio; + miss->bi_private = &s->cl; ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, miss, &s->cl); + return ret; + } +@@ -943,31 +987,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); + +- if ((bio_op(bio) != REQ_OP_DISCARD) || +- blk_queue_discard(bdev_get_queue(dc->bdev))) +- closure_bio_submit(s->iop.c, bio, cl); ++ if (bio_op(bio) == REQ_OP_DISCARD && ++ !blk_queue_discard(bdev_get_queue(dc->bdev))) ++ goto insert_data; ++ ++ /* I/O request sent to backing device */ ++ bio->bi_end_io = backing_request_endio; ++ closure_bio_submit(s->iop.c, bio, cl); ++ + } else if (s->iop.writeback) { + bch_writeback_add(dc); + s->iop.bio = bio; + + if (bio->bi_opf & REQ_PREFLUSH) { +- /* Also need to send a flush to the backing device */ +- struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, +- dc->disk.bio_split); +- ++ /* ++ * Also need to send a flush to the backing ++ * device. 
++ */ ++ struct bio *flush; ++ ++ flush = bio_alloc_bioset(GFP_NOIO, 0, ++ dc->disk.bio_split); ++ if (!flush) { ++ s->iop.status = BLK_STS_RESOURCE; ++ goto insert_data; ++ } + bio_copy_dev(flush, bio); +- flush->bi_end_io = request_endio; ++ flush->bi_end_io = backing_request_endio; + flush->bi_private = cl; + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; +- ++ /* I/O request sent to backing device */ + closure_bio_submit(s->iop.c, flush, cl); + } + } else { + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); +- ++ /* I/O request sent to backing device */ ++ bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + } + ++insert_data: + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); + } +@@ -981,6 +1040,7 @@ static void cached_dev_nodata(struct closure *cl) + bch_journal_meta(s->iop.c, cl); + + /* If it's a flush, we send the flush to the backing device too */ ++ bio->bi_end_io = backing_request_endio; + closure_bio_submit(s->iop.c, bio, cl); + + continue_at(cl, cached_dev_bio_complete, NULL); +@@ -1078,6 +1138,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + cached_dev_read(dc, s); + } + } else ++ /* I/O request sent to backing device */ + detached_dev_do_request(d, bio); + + return BLK_QC_T_NONE; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 41ef438e7b40..082faaf2ee2f 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -265,6 +265,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) + bio->bi_private = dc; + + closure_get(cl); ++ /* I/O request sent to backing device */ + __write_super(&dc->sb, bio); + + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 3d7d8452e0de..4ebe0119ea7e 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -289,6 
+289,7 @@ static void write_dirty(struct closure *cl) + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; + ++ /* I/O request sent to backing device */ + closure_bio_submit(io->dc->disk.c, &io->bio, cl); + } + +-- +2.16.1 + diff --git a/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch b/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch new file mode 100644 index 0000000..9a63f8d --- /dev/null +++ b/for-next/v5-0009-bcache-add-io_disable-to-struct-cached_dev.patch @@ -0,0 +1,237 @@ +From fff4f5d3d40952ddba1649fd7a5ce0d025b0b3cc Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Wed, 10 Jan 2018 21:33:45 +0800 +Subject: [PATCH v5 09/11] bcache: add io_disable to struct cached_dev + +If a bcache device is configured to writeback mode, current code does not +handle write I/O errors on backing devices properly. + +In writeback mode, write request is written to cache device, and +latter being flushed to backing device. If I/O failed when writing from +cache device to the backing device, bcache code just ignores the error and +upper layer code is NOT noticed that the backing device is broken. + +This patch tries to handle backing device failure like how the cache device +failure is handled, +- Add a error counter 'io_errors' and error limit 'error_limit' in struct + cached_dev. Add another io_disable to struct cached_dev to disable I/Os + on the problematic backing device. +- When I/O error happens on backing device, increase io_errors counter. And + if io_errors reaches error_limit, set cache_dev->io_disable to true, and + stop the bcache device. + +The result is, if backing device is broken of disconnected, and I/O errors +reach its error limit, backing device will be disabled and the associated +bcache device will be removed from system. + +Changelog: +v2: remove "bcache: " prefix in pr_error(), and use correct name string to + print out bcache device gendisk name. 
+v1: indeed this is new added in v2 patch set. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/bcache.h | 7 +++++++ + drivers/md/bcache/io.c | 14 ++++++++++++++ + drivers/md/bcache/request.c | 14 ++++++++++++-- + drivers/md/bcache/super.c | 22 ++++++++++++++++++++++ + drivers/md/bcache/sysfs.c | 15 ++++++++++++++- + 5 files changed, 69 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 7917b3820dd5..822ec75bb78c 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -360,6 +360,7 @@ struct cached_dev { + unsigned sequential_cutoff; + unsigned readahead; + ++ unsigned io_disable:1; + unsigned verify:1; + unsigned bypass_torture_test:1; + +@@ -379,6 +380,10 @@ struct cached_dev { + unsigned writeback_rate_i_term_inverse; + unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; ++ ++#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 ++ atomic_t io_errors; ++ unsigned error_limit; + }; + + enum alloc_reserve { +@@ -888,6 +893,7 @@ static inline void closure_bio_submit(struct cache_set *c, + + /* Forward declarations */ + ++void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); + void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); + void bch_bbio_count_io_errors(struct cache_set *, struct bio *, + blk_status_t, const char *); +@@ -915,6 +921,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, bool); + bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); ++bool bch_cached_dev_error(struct cached_dev *dc); + + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *, const char *, ...); +diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c +index 8013ecbcdbda..7fac97ae036e 100644 +--- a/drivers/md/bcache/io.c ++++ 
b/drivers/md/bcache/io.c +@@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, + } + + /* IO errors */ ++void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) ++{ ++ char buf[BDEVNAME_SIZE]; ++ unsigned errors; ++ ++ WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); ++ ++ errors = atomic_add_return(1, &dc->io_errors); ++ if (errors < dc->error_limit) ++ pr_err("%s: IO error on backing device, unrecoverable", ++ bio_devname(bio, buf)); ++ else ++ bch_cached_dev_error(dc); ++} + + void bch_count_io_errors(struct cache *ca, + blk_status_t error, +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index 9c6dda3b0068..03245e6980a6 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) + + if (bio->bi_status) { + struct search *s = container_of(cl, struct search, cl); ++ struct cached_dev *dc = container_of(s->d, ++ struct cached_dev, disk); + /* + * If a bio has REQ_PREFLUSH for writeback mode, it is + * speically assembled in cached_dev_write() for a non-zero +@@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) + } + s->recoverable = false; + /* should count I/O error for backing device here */ ++ bch_count_backing_io_errors(dc, bio); + } + + bio_put(bio); +@@ -1065,8 +1068,14 @@ static void detatched_dev_end_io(struct bio *bio) + bio_data_dir(bio), + &ddip->d->disk->part0, ddip->start_time); + +- kfree(ddip); ++ if (bio->bi_status) { ++ struct cached_dev *dc = container_of(ddip->d, ++ struct cached_dev, disk); ++ /* should count I/O error for backing device here */ ++ bch_count_backing_io_errors(dc, bio); ++ } + ++ kfree(ddip); + bio->bi_end_io(bio); + } + +@@ -1105,7 +1114,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int rw = bio_data_dir(bio); + +- if (unlikely(d->c && 
test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { ++ if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || ++ dc->io_disable)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + return BLK_QC_T_NONE; +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 082faaf2ee2f..91a08cdd55bd 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) + max(dc->disk.disk->queue->backing_dev_info->ra_pages, + q->backing_dev_info->ra_pages); + ++ atomic_set(&dc->io_errors, 0); ++ dc->io_disable = false; ++ dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; ++ + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); + return 0; +@@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) + return flash_dev_run(c, u); + } + ++bool bch_cached_dev_error(struct cached_dev *dc) ++{ ++ char name[BDEVNAME_SIZE]; ++ ++ if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) ++ return false; ++ ++ dc->io_disable = true; ++ /* make others know io_disable is true earlier */ ++ smp_mb(); ++ ++ pr_err("stop %s: too many IO errors on backing device %s\n", ++ dc->disk.disk->disk_name, bdevname(dc->bdev, name)); ++ ++ bcache_device_stop(&dc->disk); ++ return true; ++} ++ + /* Cache set */ + + __printf(2, 3) +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index cf973c07c856..ac3adf2dcf6c 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -134,7 +134,9 @@ SHOW(__bch_cached_dev) + var_print(writeback_delay); + var_print(writeback_percent); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); +- ++ sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); ++ sysfs_printf(io_error_limit, "%i", dc->error_limit); ++ sysfs_printf(io_disable, "%i", dc->io_disable); + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_i_term_inverse); + 
var_print(writeback_rate_p_term_inverse); +@@ -225,6 +227,14 @@ STORE(__cached_dev) + d_strtoul(writeback_rate_i_term_inverse); + d_strtoul_nonzero(writeback_rate_p_term_inverse); + ++ sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); ++ ++ if (attr == &sysfs_io_disable) { ++ int v = strtoul_or_return(buf); ++ ++ dc->io_disable = v ? 1 : 0; ++ } ++ + d_strtoi_h(sequential_cutoff); + d_strtoi_h(readahead); + +@@ -332,6 +342,9 @@ static struct attribute *bch_cached_dev_files[] = { + &sysfs_writeback_rate_i_term_inverse, + &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_debug, ++ &sysfs_errors, ++ &sysfs_io_error_limit, ++ &sysfs_io_disable, + &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, +-- +2.16.1 + diff --git a/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch b/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch new file mode 100644 index 0000000..d03b24e --- /dev/null +++ b/for-next/v5-0010-bcache-stop-bcache-device-when-backing-device-is-.patch @@ -0,0 +1,152 @@ +From 7d8d5a020e69671723932da30a1800eed91d3bcd Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 13 Jan 2018 17:31:44 +0800 +Subject: [PATCH v5 10/11] bcache: stop bcache device when backing device is + offline + +Currently bcache does not handle backing device failure, if backing +device is offline and disconnected from system, its bcache device can still +be accessible. If the bcache device is in writeback mode, I/O requests even +can success if the requests hit on cache device. That is to say, when and +how bcache handles offline backing device is undefined. + +This patch tries to handle backing device offline in a rather simple way, +- Add cached_dev->status_update_thread kernel thread to update backing + device status in every 1 second. +- Add cached_dev->offline_seconds to record how many seconds the backing + device is observed to be offline. 
If the backing device is offline for + BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and + call bcache_device_stop() to stop the bache device which linked to the + offline backing device. + +Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds, +its bcache device will be removed, then user space application writing on +it will get error immediately, and handler the device failure in time. + +This patch is quite simple, does not handle more complicated situations. +Once the bcache device is stopped, users need to recovery the backing +device, register and attach it manually. + +Changelog: +v2: remove "bcache: " prefix when calling pr_warn(). +v1: initial version. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.com> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +--- + drivers/md/bcache/bcache.h | 2 ++ + drivers/md/bcache/super.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 57 insertions(+) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 822ec75bb78c..1c749352172d 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -338,6 +338,7 @@ struct cached_dev { + + struct keybuf writeback_keys; + ++ struct task_struct *status_update_thread; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. 
(Maintain LBA order; don't allow reads completing out of +@@ -384,6 +385,7 @@ struct cached_dev { + #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned error_limit; ++ unsigned offline_seconds; + }; + + enum alloc_reserve { +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 91a08cdd55bd..de0f5fb9bde2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -646,6 +646,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) + { + struct bcache_device *d = b->bd_disk->private_data; ++ struct cached_dev *dc = container_of(d, struct cached_dev, disk); ++ ++ if (dc->io_disable) ++ return -EIO; ++ + return d->ioctl(d, mode, cmd, arg); + } + +@@ -856,6 +861,45 @@ static void calc_cached_dev_sectors(struct cache_set *c) + c->cached_dev_sectors = sectors; + } + ++#define BACKING_DEV_OFFLINE_TIMEOUT 5 ++static int cached_dev_status_update(void *arg) ++{ ++ struct cached_dev *dc = arg; ++ struct request_queue *q; ++ char buf[BDEVNAME_SIZE]; ++ ++ /* ++ * If this delayed worker is stopping outside, directly quit here. ++ * dc->io_disable might be set via sysfs interface, so check it ++ * here too. 
++ */ ++ while (!kthread_should_stop() && !dc->io_disable) { ++ q = bdev_get_queue(dc->bdev); ++ if (blk_queue_dying(q)) ++ dc->offline_seconds++; ++ else ++ dc->offline_seconds = 0; ++ ++ if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) { ++ pr_err("%s: device offline for %d seconds", ++ bdevname(dc->bdev, buf), ++ BACKING_DEV_OFFLINE_TIMEOUT); ++ pr_err("%s: disable I/O request due to backing " ++ "device offline", dc->disk.name); ++ dc->io_disable = true; ++ /* let others know earlier that io_disable is true */ ++ smp_mb(); ++ bcache_device_stop(&dc->disk); ++ break; ++ } ++ ++ schedule_timeout_interruptible(HZ); ++ } ++ ++ dc->status_update_thread = NULL; ++ return 0; ++} ++ + void bch_cached_dev_run(struct cached_dev *dc) + { + struct bcache_device *d = &dc->disk; +@@ -898,6 +942,15 @@ void bch_cached_dev_run(struct cached_dev *dc) + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + pr_debug("error creating sysfs link"); ++ ++ dc->status_update_thread = kthread_run(cached_dev_status_update, ++ dc, ++ "bcache_status_update"); ++ if (IS_ERR(dc->status_update_thread)) { ++ pr_warn("failed to create bcache_status_update kthread, " ++ "continue to run without monitoring backing " ++ "device status"); ++ } + } + + /* +@@ -1118,6 +1171,8 @@ static void cached_dev_free(struct closure *cl) + kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); ++ if (!IS_ERR_OR_NULL(dc->status_update_thread)) ++ kthread_stop(dc->status_update_thread); + + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); +-- +2.16.1 + diff --git a/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch b/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch new file mode 100644 index 0000000..c782476 --- /dev/null +++ 
b/for-next/v5-0011-bcache-add-stop_when_cache_set_failed-option-to-b.patch @@ -0,0 +1,251 @@ +From e8f72263c0f4f20b85f42a617fa4998115f797af Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 5 Feb 2018 10:43:18 +0800 +Subject: [PATCH v5 11/11] bcache: add stop_when_cache_set_failed option to + backing device + +Current bcache failure handling code will stop all attached bcache devices +when the cache set is broken or disconnected. This might not be desired +behavior, example bcache deployed for an email service. In such workload, +if cache device is broken but no dirty data lost, keep the bcache device +alive and permit email service continue to access data might be a better +solution for the cache device failure. + +Nix <nix@esperi.org.uk> points out the issue and provides the above example +to explain why it might be necessary to not stop bcache device for broken +cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant +suggestion to provide "always" and "auto" options to per-cached device +sysfs file stop_when_cache_set_failed. If cache set is retiring and the +backing device has no dirty data on cache, it should be safe to keep the +bcache device alive. In this case, if stop_when_cache_set_failed is set to +"auto", the device failure handling code will not stop this bcache device +and permit application to access the backing device with a unattached +bcache device. + +Changelog: +v2: change option values of stop_when_cache_set_failed from 1/0 to + "auto"/"always". +v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1 + (always stop). 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Nix <nix@esperi.org.uk> +Cc: Pavel Goran <via-bcache@pvgoran.name> +Cc: Michael Lyle <mlyle@lyle.org> +Cc: Junhui Tang <tang.junhui@zte.com.cn> +Cc: Hannes Reinecke <hare@suse.com> +--- + drivers/md/bcache/bcache.h | 8 +++++ + drivers/md/bcache/super.c | 90 ++++++++++++++++++++++++++++++++++++---------- + drivers/md/bcache/sysfs.c | 17 +++++++++ + 3 files changed, 97 insertions(+), 18 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 1c749352172d..59e675304b7e 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -287,6 +287,12 @@ struct io { + sector_t last; + }; + ++enum stop_on_faliure { ++ BCH_CACHED_DEV_STOP_ATUO = 0, ++ BCH_CACHED_DEV_STOP_ALWAYS, ++ BCH_CACHED_DEV_STOP_MODE_MAX, ++}; ++ + struct cached_dev { + struct list_head list; + struct bcache_device disk; +@@ -382,6 +388,7 @@ struct cached_dev { + unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; + ++ enum stop_on_faliure stop_when_cache_set_failed; + #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 + atomic_t io_errors; + unsigned error_limit; +@@ -933,6 +940,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *); + + extern struct workqueue_struct *bcache_wq; + extern const char * const bch_cache_modes[]; ++extern const char * const bch_stop_on_failure_modes[]; + extern struct mutex bch_register_lock; + extern struct list_head bch_cache_sets; + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index de0f5fb9bde2..d2999f1e6ae2 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = { + NULL + }; + ++/* Default is -1; we skip past it for stop_when_cache_set_failed */ ++const char * const bch_stop_on_failure_modes[] = { ++ "default", ++ "auto", ++ "always", ++ NULL ++}; ++ + static struct kobject *bcache_kobj; + struct mutex bch_register_lock; + 
LIST_HEAD(bch_cache_sets); +@@ -1246,6 +1254,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) + atomic_set(&dc->io_errors, 0); + dc->io_disable = false; + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; ++ /* default to auto */ ++ dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_ATUO; + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); +@@ -1541,33 +1551,77 @@ static void cache_set_flush(struct closure *cl) + closure_return(cl); + } + ++/* ++ * This function is only called when CACHE_SET_IO_DISABLE is set, which means ++ * cache set is unregistering due to too many I/O errors. In this condition, ++ * the bcache device might be stopped, it depends on stop_when_cache_set_failed ++ * value and whether the broken cache has dirty data: ++ * ++ * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device ++ * BCH_CACHED_STOP_ATUO 0 NO ++ * BCH_CACHED_STOP_ATUO 1 YES ++ * BCH_CACHED_DEV_STOP_ALWAYS 0 YES ++ * BCH_CACHED_DEV_STOP_ALWAYS 1 YES ++ * ++ * The expected behavior is, if stop_when_cache_set_failed is configured to ++ * "auto" via sysfs interface, the bcache device will not be stopped if the ++ * backing device is clean on the broken cache device. 
++ */ ++static void conditional_stop_bcache_device(struct cache_set *c, ++ struct bcache_device *d, ++ struct cached_dev *dc) ++{ ++ if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) { ++ pr_warn("stop_when_cache_set_failed of %s is \"always\", stop" ++ " it for failed cache set %pU.", ++ d->disk->disk_name, c->sb.set_uuid); ++ bcache_device_stop(d); ++ return; ++ } else if (atomic_read(&dc->has_dirty)) { ++ /* ++ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO ++ * and dc->has_dirty == 1 ++ */ ++ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " ++ "cache is dirty, stop it to avoid potential data " ++ "corruption.", ++ d->disk->disk_name); ++ bcache_device_stop(d); ++ } else { ++ /* ++ * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_ATUO ++ * and dc->has_dirty == 0 ++ */ ++ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and " ++ "cache is clean, keep it alive.", ++ d->disk->disk_name); ++ } ++} ++ + static void __cache_set_unregister(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cached_dev *dc; ++ struct bcache_device *d; + size_t i; + + mutex_lock(&bch_register_lock); + +- for (i = 0; i < c->devices_max_used; i++) +- if (c->devices[i]) { +- if (!UUID_FLASH_ONLY(&c->uuids[i]) && +- test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { +- dc = container_of(c->devices[i], +- struct cached_dev, disk); +- bch_cached_dev_detach(dc); +- /* +- * If we come here by too many I/O errors, +- * bcache device should be stopped too, to +- * keep data consistency on cache and +- * backing devices. 
+- */ +- if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) +- bcache_device_stop(c->devices[i]); +- } else { +- bcache_device_stop(c->devices[i]); +- } ++ for (i = 0; i < c->devices_max_used; i++) { ++ d = c->devices[i]; ++ if (!d) ++ continue; ++ ++ if (!UUID_FLASH_ONLY(&c->uuids[i]) && ++ test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { ++ dc = container_of(d, struct cached_dev, disk); ++ bch_cached_dev_detach(dc); ++ if (test_bit(CACHE_SET_IO_DISABLE, &c->flags)) ++ conditional_stop_bcache_device(c, d, dc); ++ } else { ++ bcache_device_stop(d); + } ++ } + + mutex_unlock(&bch_register_lock); + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index ac3adf2dcf6c..e88fdcc549cd 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us); + rw_attribute(sequential_cutoff); + rw_attribute(data_csum); + rw_attribute(cache_mode); ++rw_attribute(stop_when_cache_set_failed); + rw_attribute(writeback_metadata); + rw_attribute(writeback_running); + rw_attribute(writeback_percent); +@@ -126,6 +127,12 @@ SHOW(__bch_cached_dev) + bch_cache_modes + 1, + BDEV_CACHE_MODE(&dc->sb)); + ++ if (attr == &sysfs_stop_when_cache_set_failed) ++ return bch_snprint_string_list(buf, PAGE_SIZE, ++ bch_stop_on_failure_modes + 1, ++ dc->stop_when_cache_set_failed); ++ ++ + sysfs_printf(data_csum, "%i", dc->disk.data_csum); + var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); +@@ -257,6 +264,15 @@ STORE(__cached_dev) + } + } + ++ if (attr == &sysfs_stop_when_cache_set_failed) { ++ v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1); ++ ++ if (v < 0) ++ return v; ++ ++ dc->stop_when_cache_set_failed = v; ++ } ++ + if (attr == &sysfs_label) { + if (size > SB_LABEL_SIZE) + return -EINVAL; +@@ -333,6 +349,7 @@ static struct attribute *bch_cached_dev_files[] = { + &sysfs_data_csum, + #endif + &sysfs_cache_mode, ++ &sysfs_stop_when_cache_set_failed, + &sysfs_writeback_metadata, + 
&sysfs_writeback_running, + &sysfs_writeback_delay, +-- +2.16.1 + |