diff options
author | Coly Li <colyli@suse.de> | 2023-03-04 00:12:38 +0800 |
---|---|---|
committer | Coly Li <colyli@suse.de> | 2023-03-04 00:12:38 +0800 |
commit | 62192d0140511f8489265a1e81ca2543e20c37cd (patch) | |
tree | 4431656ed1325f92014814c49b117f6ec21455a2 | |
parent | 94559fbf1632bd111ccfdd449dcd346d36fe553c (diff) | |
download | bcache-patches-62192d0140511f8489265a1e81ca2543e20c37cd.tar.gz |
update for-next and for-test
23 files changed, 4244 insertions, 0 deletions
diff --git a/for-next/20221207_ye_xingchen_bcache_convert_to_use_sysfs_emit_sysfs_emit_at_apis.mbx b/for-next/20221207_ye_xingchen_bcache_convert_to_use_sysfs_emit_sysfs_emit_at_apis.mbx new file mode 100644 index 0000000..d054edb --- /dev/null +++ b/for-next/20221207_ye_xingchen_bcache_convert_to_use_sysfs_emit_sysfs_emit_at_apis.mbx @@ -0,0 +1,74 @@ +From git@z Thu Jan 1 00:00:00 1970 +Subject: [PATCH] bcache: Convert to use sysfs_emit()/sysfs_emit_at() APIs +From: ye.xingchen@zte.com.cn <ye.xingchen@zte.com.cn> +Date: Wed, 07 Dec 2022 17:02:35 +0800 +Message-Id: <202212071702359325169@zte.com.cn> +To: <colyli@suse.de> +Cc: <kent.overstreet@gmail.com>, <linux-bcache@vger.kernel.org>, <linux-kernel@vger.kernel.org> +List-Id: <linux-bcache.vger.kernel.org> +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +From: ye xingchen <ye.xingchen@zte.com.cn> + +Follow the advice of the Documentation/filesystems/sysfs.rst and show() +should only use sysfs_emit() or sysfs_emit_at() when formatting the +value to be returned to user space. 
+ +Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn> +--- + drivers/md/bcache/sysfs.c | 31 +++++++++++++++---------------- + 1 file changed, 15 insertions(+), 16 deletions(-) + +diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c +index c6f677059214..0e2c1880f60b 100644 +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -1111,26 +1111,25 @@ SHOW(__bch_cache) + + vfree(p); + +- ret = scnprintf(buf, PAGE_SIZE, +- "Unused: %zu%%\n" +- "Clean: %zu%%\n" +- "Dirty: %zu%%\n" +- "Metadata: %zu%%\n" +- "Average: %llu\n" +- "Sectors per Q: %zu\n" +- "Quantiles: [", +- unused * 100 / (size_t) ca->sb.nbuckets, +- available * 100 / (size_t) ca->sb.nbuckets, +- dirty * 100 / (size_t) ca->sb.nbuckets, +- meta * 100 / (size_t) ca->sb.nbuckets, sum, +- n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); ++ ret = sysfs_emit(buf, ++ "Unused: %zu%%\n" ++ "Clean: %zu%%\n" ++ "Dirty: %zu%%\n" ++ "Metadata: %zu%%\n" ++ "Average: %llu\n" ++ "Sectors per Q: %zu\n" ++ "Quantiles: [", ++ unused * 100 / (size_t) ca->sb.nbuckets, ++ available * 100 / (size_t) ca->sb.nbuckets, ++ dirty * 100 / (size_t) ca->sb.nbuckets, ++ meta * 100 / (size_t) ca->sb.nbuckets, sum, ++ n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + + for (i = 0; i < ARRAY_SIZE(q); i++) +- ret += scnprintf(buf + ret, PAGE_SIZE - ret, +- "%u ", q[i]); ++ ret += sysfs_emit_at(buf, ret, "%u ", q[i]); + ret--; + +- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); ++ ret += sysfs_emit_at(buf, ret, "]\n"); + + return ret; + } + +-- +2.25.1 + + diff --git a/for-next/20230214_linux_bcache_make_kobj_type_structures_constant.mbx b/for-next/20230214_linux_bcache_make_kobj_type_structures_constant.mbx new file mode 100644 index 0000000..2d70d11 --- /dev/null +++ b/for-next/20230214_linux_bcache_make_kobj_type_structures_constant.mbx @@ -0,0 +1,64 @@ +From git@z Thu Jan 1 00:00:00 1970 +Subject: [PATCH] bcache: make kobj_type structures constant +From: Thomas Weißschuh <linux@weissschuh.net> +Date: Tue, 14 
Feb 2023 03:13:39 +0000 +Message-Id: <20230214-kobj_type-bcache-v1-1-cf00ead7bee7@weissschuh.net> +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 8bit + +Since commit ee6d3dd4ed48 ("driver core: make kobj_type constant.") +the driver core allows the usage of const struct kobj_type. + +Take advantage of this to constify the structure definitions to prevent +modification at runtime. + +Signed-off-by: Thomas Weißschuh <linux@weissschuh.net> +--- + drivers/md/bcache/bcache.h | 10 +++++----- + drivers/md/bcache/sysfs.h | 2 +- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index aebb7ef10e63..a522f4f1f992 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -1004,11 +1004,11 @@ extern struct workqueue_struct *bch_flush_wq; + extern struct mutex bch_register_lock; + extern struct list_head bch_cache_sets; + +-extern struct kobj_type bch_cached_dev_ktype; +-extern struct kobj_type bch_flash_dev_ktype; +-extern struct kobj_type bch_cache_set_ktype; +-extern struct kobj_type bch_cache_set_internal_ktype; +-extern struct kobj_type bch_cache_ktype; ++extern const struct kobj_type bch_cached_dev_ktype; ++extern const struct kobj_type bch_flash_dev_ktype; ++extern const struct kobj_type bch_cache_set_ktype; ++extern const struct kobj_type bch_cache_set_internal_ktype; ++extern const struct kobj_type bch_cache_ktype; + + void bch_cached_dev_release(struct kobject *kobj); + void bch_flash_dev_release(struct kobject *kobj); +diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h +index a2ff6447b699..65b8bd975ab1 100644 +--- a/drivers/md/bcache/sysfs.h ++++ b/drivers/md/bcache/sysfs.h +@@ -3,7 +3,7 @@ + #define _BCACHE_SYSFS_H_ + + #define KTYPE(type) \ +-struct kobj_type type ## _ktype = { \ ++const struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &((const struct sysfs_ops) { \ + .show = 
type ## _show, \ + +--- +base-commit: f6feea56f66d34259c4222fa02e8171c4f2673d1 +change-id: 20230214-kobj_type-bcache-6d2bd129b0fa + +Best regards, +-- +Thomas Weißschuh <linux@weissschuh.net> + diff --git a/for-next/20230225_andrea_tomassetti_opensource_bcache_remove_dead_references_to_cache_readaheads.mbx b/for-next/20230225_andrea_tomassetti_opensource_bcache_remove_dead_references_to_cache_readaheads.mbx new file mode 100644 index 0000000..650d185 --- /dev/null +++ b/for-next/20230225_andrea_tomassetti_opensource_bcache_remove_dead_references_to_cache_readaheads.mbx @@ -0,0 +1,47 @@ +From git@z Thu Jan 1 00:00:00 1970 +Subject: [PATCH] bcache: Remove dead references to cache_readaheads +From: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> +Date: Sat, 25 Feb 2023 16:33:55 +0100 +Message-Id: <20230225153355.2779474-1-andrea.tomassetti-opensource@devo.com> +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +The cache_readaheads stat counter is not used anymore and should be +removed. + +Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> +--- + Documentation/admin-guide/bcache.rst | 3 --- + drivers/md/bcache/stats.h | 1 - + 2 files changed, 4 deletions(-) + +diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst +index bb5032a99234..6fdb495ac466 100644 +--- a/Documentation/admin-guide/bcache.rst ++++ b/Documentation/admin-guide/bcache.rst +@@ -508,9 +508,6 @@ cache_miss_collisions + cache miss, but raced with a write and data was already present (usually 0 + since the synchronization for cache misses was rewritten) + +-cache_readaheads +- Count of times readahead occurred. 
+- + Sysfs - cache set + ~~~~~~~~~~~~~~~~~ + +diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h +index bd3afc856d53..21b445f8af15 100644 +--- a/drivers/md/bcache/stats.h ++++ b/drivers/md/bcache/stats.h +@@ -18,7 +18,6 @@ struct cache_stats { + unsigned long cache_misses; + unsigned long cache_bypass_hits; + unsigned long cache_bypass_misses; +- unsigned long cache_readaheads; + unsigned long cache_miss_collisions; + unsigned long sectors_bypassed; + +-- +2.39.2 + diff --git a/for-next/[PATCH 1_2] bcache_ fixup btree_cache_wait list damage.eml b/for-next/[PATCH 1_2] bcache_ fixup btree_cache_wait list damage.eml new file mode 100644 index 0000000..25188fd --- /dev/null +++ b/for-next/[PATCH 1_2] bcache_ fixup btree_cache_wait list damage.eml @@ -0,0 +1,171 @@ +Return-Path: <mingzhe.zou@easystack.cn> +Delivered-To: colyli +Received: from dovecot-director2.suse.de ([192.168.254.65]) + (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)) + by imap2.suse-dmz.suse.de with LMTPS + id tcrmJrvvRmK1NwAAMHmgww + (envelope-from <mingzhe.zou@easystack.cn>) + for <colyli>; Fri, 01 Apr 2022 12:27:39 +0000 +Received: from relay2.suse.de ([149.44.160.134]) + (using TLSv1.2 with cipher ECDHE-ECDSA-AES128-GCM-SHA256 (128/128 bits)) + by dovecot-director2.suse.de with LMTPS + id KGCAJLvvRmJsTQAApTUePA + (envelope-from <mingzhe.zou@easystack.cn>) + for <colyli@imap.suse.de>; Fri, 01 Apr 2022 12:27:39 +0000 +Received: from relay2.suse.de (localhost [127.0.0.1]) + by relay2.suse.de (Postfix) with ESMTP id 8C5A4A3B89 + for <colyli@imap.suse.de>; Fri, 1 Apr 2022 12:27:39 +0000 (UTC) +X-Virus-Scanned: by amavisd-new at relay2.suse.de +X-Spam-Flag: NO +X-Spam-Score: 0.77 +X-Spam-Level: +X-Spam-Status: No, score=0.77 tagged_above=-9999 required=5 + tests=[BAYES_50=0.8, RCVD_IN_DNSWL_NONE=-0.0001, + RCVD_IN_MSPIKE_H4=-0.01, RCVD_IN_MSPIKE_WL=-0.01, + T_SCC_BODY_TEXT_LINE=-0.01] autolearn=no autolearn_force=no +Received: from relay2.suse.de ([127.0.0.1]) + by 
relay2.suse.de (relay2.suse.de [127.0.0.1]) (amavisd-new, port 10026) + with ESMTP id tYeLYEP5FXWf for <colyli@imap.suse.de>; + Fri, 1 Apr 2022 12:27:33 +0000 (UTC) +Received: from mx2.suse.de (unknown [149.44.161.68]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by relay2.suse.de (Postfix) with ESMTPS id 9FD91A3B95 + for <colyli@imap.suse.de>; Fri, 1 Apr 2022 12:27:32 +0000 (UTC) +Received: from mail-m2835.qiye.163.com (mail-m2835.qiye.163.com [103.74.28.35]) + (using TLSv1.2 with cipher ECDHE-ECDSA-AES128-GCM-SHA256 (128/128 bits)) + (No client certificate requested) + by mx2.suse.de (Postfix) with ESMTPS id 5B880AD0E + for <colyli@suse.de>; Fri, 1 Apr 2022 12:27:30 +0000 (UTC) +Received: from localhost.localdomain (unknown [218.94.118.90]) + by mail-m2835.qiye.163.com (Hmail) with ESMTPA id 1FCC38A07D6; + Fri, 1 Apr 2022 20:27:28 +0800 (CST) +From: mingzhe.zou@easystack.cn +To: colyli@suse.de, + linux-bcache@vger.kernel.org +Cc: zoumingzhe@qq.com, + ZouMingzhe <mingzhe.zou@easystack.cn> +Subject: [PATCH 1/2] bcache: fixup btree_cache_wait list damage +Date: Fri, 1 Apr 2022 20:27:24 +0800 +Message-Id: <20220401122725.17725-1-mingzhe.zou@easystack.cn> +X-Mailer: git-send-email 2.17.1 +X-HM-Spam-Status: e1kfGhgUHx5ZQUtXWQgPGg8OCBgUHx5ZQUlOS1dZCBgUCR5ZQVlLVUtZV1 + kWDxoPAgseWUFZKDYvK1lXWShZQUlCN1dZLVlBSVdZDwkaFQgSH1lBWRpKSUhWHRpMTU4fTBodSk + 1LVRkRExYaEhckFA4PWVdZFhoPEhUdFFlBWU9LSFVKSktISkxVS1kG +X-HM-Sender-Digest: e1kMHhlZQR0aFwgeV1kSHx4VD1lBWUc6ODY6Tjo*ATIoFExDGDYvDhM2 + KzwwFFFVSlVKTU9DQ0pNS09DT0xDVTMWGhIXVRYSFRwBEx5VARQOOx4aCAIIDxoYEFUYFUVZV1kS + C1lBWUlKQ1VCT1VKSkNVQktZV1kIAVlBT0JKSzcG +X-HM-Tid: 0a7fe518488e841dkuqw1fcc38a07d6 + +From: ZouMingzhe <mingzhe.zou@easystack.cn> + +We get a kernel crash about "list_add corruption. next->prev should be +prev (ffff9c801bc01210), but was ffff9c77b688237c. (next=ffffae586d8afe68)." 
+ +crash> struct list_head 0xffff9c801bc01210 +struct list_head { + next = 0xffffae586d8afe68, + prev = 0xffffae586d8afe68 +} +crash> struct list_head 0xffff9c77b688237c +struct list_head { + next = 0x0, + prev = 0x0 +} +crash> struct list_head 0xffffae586d8afe68 +struct list_head struct: invalid kernel virtual address: ffffae586d8afe68 type: "gdb_readmem_callback" +Cannot access memory at address 0xffffae586d8afe68 + +[230469.019492] Call Trace: +[230469.032041] prepare_to_wait+0x8a/0xb0 +[230469.044363] ? bch_btree_keys_free+0x6c/0xc0 [escache] +[230469.056533] mca_cannibalize_lock+0x72/0x90 [escache] +[230469.068788] mca_alloc+0x2ae/0x450 [escache] +[230469.080790] bch_btree_node_get+0x136/0x2d0 [escache] +[230469.092681] bch_btree_check_thread+0x1e1/0x260 [escache] +[230469.104382] ? finish_wait+0x80/0x80 +[230469.115884] ? bch_btree_check_recurse+0x1a0/0x1a0 [escache] +[230469.127259] kthread+0x112/0x130 +[230469.138448] ? kthread_flush_work_fn+0x10/0x10 +[230469.149477] ret_from_fork+0x35/0x40 + +bch_btree_check_thread() and bch_dirty_init_thread() may call +mca_cannibalize() to cannibalize other cached btree nodes. Only +one thread can do it at a time, so the op of other threads will +be added to the btree_cache_wait list. + +We must call finish_wait() to remove op from btree_cache_wait +before freeing its memory. Otherwise, the list will be +damaged. Also should call bch_cannibalize_unlock() to release +the btree_cache_alloc_lock and wake_up other waiters. 
+ +Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn> +--- + drivers/md/bcache/btree.c | 10 +++++++++- + drivers/md/bcache/btree.h | 2 ++ + drivers/md/bcache/writeback.c | 8 ++++++++ + 3 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index ad9f16689419..f8e6f5c7c736 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -885,7 +885,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +-static void bch_cannibalize_unlock(struct cache_set *c) ++void bch_cannibalize_unlock(struct cache_set *c) + { + spin_lock(&c->btree_cannibalize_lock); + if (c->btree_cache_alloc_lock == current) { +@@ -1968,6 +1968,14 @@ static int bch_btree_check_thread(void *arg) + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); ++ /* The op may be added to cache_set's btree_cache_wait ++ * in mca_cannibalize(), must ensure it is removed from ++ * the list and release btree_cache_alloc_lock before ++ * free op memory. ++ * Otherwise, the btree_cache_wait will be damaged. 
++ */ ++ bch_cannibalize_unlock(c); ++ finish_wait(&c->btree_cache_wait, &(&op)->wait); + if (ret) + goto out; + } +diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h +index 50482107134f..435e82574ac3 100644 +--- a/drivers/md/bcache/btree.h ++++ b/drivers/md/bcache/btree.h +@@ -365,6 +365,8 @@ static inline void force_wake_up_gc(struct cache_set *c) + _r; \ + }) + ++void bch_cannibalize_unlock(struct cache_set *c); ++ + #define MAP_DONE 0 + #define MAP_CONTINUE 1 + +diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c +index 9ee0005874cd..5b828555bca8 100644 +--- a/drivers/md/bcache/writeback.c ++++ b/drivers/md/bcache/writeback.c +@@ -865,6 +865,14 @@ static int bch_root_node_dirty_init(struct cache_set *c, + } + } while (ret == -EAGAIN); + ++ /* The op may be added to cache_set's btree_cache_wait ++ * in mca_cannibalize(), must ensure it is removed from ++ * the list and release btree_cache_alloc_lock before ++ * free op memory. ++ * Otherwise, the btree_cache_wait will be damaged. 
++ */ ++ bch_cannibalize_unlock(c); ++ finish_wait(&c->btree_cache_wait, &(&op.op)->wait); + return ret; + } + +-- +2.17.1 + diff --git a/for-next/v3_20230217_zyytlz_wz_bcache_remove_some_unnecessary_null_point_check_for_the_return_value_of___bch_.mbx b/for-next/v3_20230217_zyytlz_wz_bcache_remove_some_unnecessary_null_point_check_for_the_return_value_of___bch_.mbx new file mode 100644 index 0000000..48f99a9 --- /dev/null +++ b/for-next/v3_20230217_zyytlz_wz_bcache_remove_some_unnecessary_null_point_check_for_the_return_value_of___bch_.mbx @@ -0,0 +1,101 @@ +From git@z Thu Jan 1 00:00:00 1970 +Subject: [PATCH v3] bcache: Remove some unnecessary NULL point check for + the return value of __bch_btree_node_alloc-related pointer +From: Zheng Wang <zyytlz.wz@163.com> +Date: Fri, 17 Feb 2023 18:09:01 +0800 +Message-Id: <20230217100901.707245-1-zyytlz.wz@163.com> +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +Due to the previously fix of __bch_btree_node_alloc, the return value will +never be a NULL pointer. So IS_ERR is enough to handle the failure + situation. Fix it by replacing IS_ERR_OR_NULL check to IS_ERR check. 
+ +Fixes: cafe56359144 ("bcache: A block layer cache") +Cc: stable@vger.kernel.org +Signed-off-by: Zheng Wang <zyytlz.wz@163.com> +--- +v3: +- Add Cc: stable@vger.kernel.org suggested by Eric +v2: +- Replace more checks +--- + drivers/md/bcache/btree.c | 10 +++++----- + drivers/md/bcache/super.c | 4 ++-- + 2 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 147c493a989a..7c21e54468bf 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1138,7 +1138,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, + { + struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); + +- if (!IS_ERR_OR_NULL(n)) { ++ if (!IS_ERR(n)) { + mutex_lock(&n->write_lock); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); +@@ -1340,7 +1340,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + memset(new_nodes, 0, sizeof(new_nodes)); + closure_init_stack(&cl); + +- while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) ++ while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; +@@ -1352,7 +1352,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); +- if (IS_ERR_OR_NULL(new_nodes[i])) ++ if (IS_ERR(new_nodes[i])) + goto out_nocoalesce; + } + +@@ -1487,7 +1487,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + bch_keylist_free(&keylist); + + for (i = 0; i < nodes; i++) +- if (!IS_ERR_OR_NULL(new_nodes[i])) { ++ if (!IS_ERR(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } +@@ -1669,7 +1669,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, + if (should_rewrite) { + n = btree_node_alloc_replacement(b, NULL); + +- if (!IS_ERR_OR_NULL(n)) { 
++ if (!IS_ERR(n)) { + bch_btree_node_write_sync(n); + + bch_btree_set_root(n); +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index ba3909bb6bea..7660962e7b8b 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1724,7 +1724,7 @@ static void cache_set_flush(struct closure *cl) + if (!IS_ERR_OR_NULL(c->gc_thread)) + kthread_stop(c->gc_thread); + +- if (!IS_ERR_OR_NULL(c->root)) ++ if (!IS_ERR(c->root)) + list_add(&c->root->list, &c->btree_cache); + + /* +@@ -2088,7 +2088,7 @@ static int run_cache_set(struct cache_set *c) + + err = "cannot allocate new btree root"; + c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); +- if (IS_ERR_OR_NULL(c->root)) ++ if (IS_ERR(c->root)) + goto err; + + mutex_lock(&c->root->write_lock); +-- +2.25.1 + diff --git a/for-next/v3_20230218_zyytlz_wz_bcache_fix___bch_btree_node_alloc_to_make_the_failure_behavior_consistent.mbx b/for-next/v3_20230218_zyytlz_wz_bcache_fix___bch_btree_node_alloc_to_make_the_failure_behavior_consistent.mbx new file mode 100644 index 0000000..9aef3d4 --- /dev/null +++ b/for-next/v3_20230218_zyytlz_wz_bcache_fix___bch_btree_node_alloc_to_make_the_failure_behavior_consistent.mbx @@ -0,0 +1,50 @@ +From git@z Thu Jan 1 00:00:00 1970 +Subject: [PATCH v3] bcache: Fix __bch_btree_node_alloc to make the failure + behavior consistent +From: Zheng Wang <zyytlz.wz@163.com> +Date: Sat, 18 Feb 2023 15:23:35 +0800 +Message-Id: <20230218072335.1537099-1-zyytlz.wz@163.com> +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 7bit + +In some specific situation, the return value of __bch_btree_node_alloc may +be NULL. This may lead to poential NULL pointer dereference in caller + function like a calling chaion : + btree_split->bch_btree_node_alloc->__bch_btree_node_alloc. + +Fix it by initialize return value in __bch_btree_node_alloc before return. 
+ +Fixes: cafe56359144 ("bcache: A block layer cache") +Cc: stable@vger.kernel.org +Signed-off-by: Zheng Wang <zyytlz.wz@163.com> +--- +v3: +- Add Cc: stable@vger.kernel.org suggested by Eric +v2: +- split patch v1 into two patches to make it clearer suggested by Coly Li +--- + drivers/md/bcache/btree.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 147c493a989a..cae25e74b9e0 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1090,10 +1090,12 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + struct btree *parent) + { + BKEY_PADDED(key) k; +- struct btree *b = ERR_PTR(-EAGAIN); ++ struct btree *b; + + mutex_lock(&c->bucket_lock); + retry: ++ /* return ERR_PTR(-EAGAIN) when it fails */ ++ b = ERR_PTR(-EAGAIN); + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) + goto err; + +-- +2.25.1 + diff --git a/for-test/Re_ [RFC] Live resize of backing device.eml b/for-test/Re_ [RFC] Live resize of backing device.eml new file mode 100644 index 0000000..d3f094f --- /dev/null +++ b/for-test/Re_ [RFC] Live resize of backing device.eml @@ -0,0 +1,192 @@ +From: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> +Subject: [PATCH v2] bcache: Add support for live resize of backing devices + +Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> +--- +Hi Coly, +this is the second version of the patch. As you correctly pointed out, +I implemented roll-back functionalities in case of error. +I'm testing this funcionality using QEMU/KVM vm via libvirt. +Here the steps: + 1. make-bcache --writeback -B /dev/vdb -C /dev/vdc + 2. mkfs.xfs /dev/bcache0 + 3. mount /dev/bcache0 /mnt + 3. dd if=/dev/random of=/mnt/random0 bs=1M count=1000 + 4. md5sum /mnt/random0 | tee /mnt/random0.md5 + 5. [HOST] virsh blockresize <vm-name> --path <disk-path> --size +<new-size> + 6. xfs_growfs /dev/bcache0 + 6. 
Repeat steps 3 and 4 with a different file name (e.g. random1.md5) + 7. umount/reboot/remount and check that the md5 hashes are correct with + md5sum -c /mnt/random?.md5 + + drivers/md/bcache/super.c | 84 ++++++++++++++++++++++++++++++++++++++- + 1 file changed, 83 insertions(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index ba3909bb6bea..1435a3f605f8 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2443,6 +2443,85 @@ static bool bch_is_open(dev_t dev) + return bch_is_open_cache(dev) || bch_is_open_backing(dev); + } + ++static bool bch_update_capacity(dev_t dev) ++{ ++ const size_t max_stripes = min_t(size_t, INT_MAX, ++ SIZE_MAX / sizeof(atomic_t)); ++ ++ uint64_t n, n_old, orig_cached_sectors = 0; ++ void *tmp_realloc; ++ ++ int nr_stripes_old; ++ bool res = false; ++ ++ struct bcache_device *d; ++ struct cache_set *c, *tc; ++ struct cached_dev *dcp, *t, *dc = NULL; ++ ++ uint64_t parent_nr_sectors; ++ ++ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) ++ list_for_each_entry_safe(dcp, t, &c->cached_devs, list) ++ if (dcp->bdev->bd_dev == dev) { ++ dc = dcp; ++ goto dc_found; ++ } ++ ++dc_found: ++ if (!dc) ++ return false; ++ ++ parent_nr_sectors = bdev_nr_sectors(dc->bdev) - dc->sb.data_offset; ++ ++ if (parent_nr_sectors == bdev_nr_sectors(dc->disk.disk->part0)) ++ return false; ++ ++ d = &dc->disk; ++ orig_cached_sectors = d->c->cached_dev_sectors; ++ ++ /* Force cached device sectors re-calc */ ++ calc_cached_dev_sectors(d->c); ++ ++ /* Block writeback thread */ ++ down_write(&dc->writeback_lock); ++ nr_stripes_old = d->nr_stripes; ++ n = DIV_ROUND_UP_ULL(parent_nr_sectors, d->stripe_size); ++ if (!n || n > max_stripes) { ++ pr_err("nr_stripes too large or invalid: %llu (start sector beyond +end of disk?)\n", ++ n); ++ goto restore_dev_sectors; ++ } ++ d->nr_stripes = n; ++ ++ n = d->nr_stripes * sizeof(atomic_t); ++ n_old = nr_stripes_old * sizeof(atomic_t); ++ tmp_realloc = 
kvrealloc(d->stripe_sectors_dirty, n_old, ++ n, GFP_KERNEL); ++ if (!tmp_realloc) ++ goto restore_nr_stripes; ++ ++ d->stripe_sectors_dirty = (atomic_t *) tmp_realloc; ++ ++ n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); ++ n_old = BITS_TO_LONGS(nr_stripes_old) * sizeof(unsigned long); ++ tmp_realloc = kvrealloc(d->full_dirty_stripes, n_old, n, GFP_KERNEL); ++ if (!tmp_realloc) ++ goto restore_nr_stripes; ++ ++ d->full_dirty_stripes = (unsigned long *) tmp_realloc; ++ ++ if ((res = set_capacity_and_notify(dc->disk.disk, parent_nr_sectors))) ++ goto unblock_and_exit; ++ ++restore_nr_stripes: ++ d->nr_stripes = nr_stripes_old; ++restore_dev_sectors: ++ d->c->cached_dev_sectors = orig_cached_sectors; ++unblock_and_exit: ++ up_write(&dc->writeback_lock); ++ return res; ++} ++ + struct async_reg_args { + struct delayed_work reg_work; + char *path; +@@ -2569,7 +2648,10 @@ static ssize_t register_bcache(struct kobject *k, +struct kobj_attribute *attr, + mutex_lock(&bch_register_lock); + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) +- err = "device already registered"; ++ if (bch_update_capacity(dev)) ++ err = "capacity changed"; ++ else ++ err = "device already registered"; + else + err = "device busy"; + mutex_unlock(&bch_register_lock); +-- +2.39.0 + + + +On 25/1/23 18:59, Coly Li wrote: +> +> +>> 2023年1月25日 18:07,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道: +>> +>> On Tue, Jan 17, 2023 at 5:18 PM Coly Li <colyli@suse.de> wrote: +>>>> +> +>>>>> +>>>>>> struct async_reg_args { +>>>>>> struct delayed_work reg_work; +>>>>>> char *path; +>>>>>> @@ -2569,7 +2639,10 @@ static ssize_t register_bcache(struct kobject +>>>>>> *k, struct kobj_attribute *attr, +>>>>>> mutex_lock(&bch_register_lock); +>>>>>> if (lookup_bdev(strim(path), &dev) == 0 && +>>>>>> bch_is_open(dev)) +>>>>>> - err = "device already registered"; +>>>>>> + if (bch_update_capacity(dev)) +>>>>>> + err = "capacity changed"; +>>>>>> + else +>>>>>> + err = "device 
already registered"; +>>>>> +>>>>> +>>>>> As I said, it should be a separated write-only sysfile under the cache +>>>>> device's directory. +>>>> Can I ask why you don't like the automatic resize way? Why should the +>>>> resize be manual? +>>> +>>> Most of system administrators don’t like such silently automatic things. They want to extend the size explicitly, especially when there is other dependences in their configurations. +>>> +>> What I was trying to say is that, in order to resize a block device, a +>> manual command should be executed. So, this is already a "non-silent" +>> automatic thing. +>> Moreover, if the block device has a FS on it, the FS needs to be +>> manually grown with some special utilities, e.g. xfs_growfs. So, +>> again, another non-silent automatic step. Don't you agree? +>> For example, to resize a qcow device attached to a VM I'm manually +>> doing a `virsh blockresize`. As soon as I issue that command, the +>> virtio_blk driver inside the VM detects the disk size change and calls +>> the `set_capacity_and_notify` function. Why then should bcache behave +>> differently? +> +> The above VM example makes sense, I am almost convinced. +> +>> +>> If you're concerned that this can somehow break the +>> behaviour-compatibility with older versions of the driver, can we +>> protect this automatic discovery with an optional parameter? Will this +>> be an option you will take into account? +> +> Then let’s forget the option sysfs at this moment. Once you feel the patch is ready for me to testing, please notice me with detailed steps to redo your testing. +> At that time during my testing, let’s discuss whether an extra option is necesssary, for now just keep your idea as automatically resize the cached device. +> +> Thanks for your detailed explanation. 
+> +> Coly Li +> diff --git a/for-test/nvdimm-support/meta-dev-20230303/0001-bcache-add-initial-data-structures-for-nvm-pages.patch b/for-test/nvdimm-support/meta-dev-20230303/0001-bcache-add-initial-data-structures-for-nvm-pages.patch new file mode 100644 index 0000000..5758371 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0001-bcache-add-initial-data-structures-for-nvm-pages.patch @@ -0,0 +1,343 @@ +From e9147021c678184512de1776d163b5a994a209a3 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 26 Jul 2021 00:26:28 +0800 +Subject: [PATCH 01/16] bcache: add initial data structures for nvm pages + +This patch initializes the prototype data structures for nvm pages +allocator, + +- struct bch_nvmpg_sb + This is the super block allocated on each nvdimm namespace for the nvm +pages allocator. A nvdimm pages allocator set may have multiple name- +spaces, bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this +namespace belongs to. + +- struct bch_nvmpg_header + This is a table for all heads of all allocation record lists. An allo- +cation record list traces all page(s) allocated from nvdimm namespace(s) +to a specific requester (identified by uuid). After system reboot, a +requester can retrieve all previously allocated nvdimm pages from its +record list by a pre-defined uuid. + +- struct bch_nvmpg_head + This is a head of an allocation record list. Each nvdimm pages +requester (typically it's a driver) has and only has one allocation +record list, and an allocated nvdimm page only belongs to a specific +allocation record list. Member uuid[] will be set as the requester's +uuid, e.g. for bcache it is the cache set uuid. Member label is not +mandatory, it is a human-readable string for debug purpose. The nvm +offset format pointers recs_offset[] point to the location of actual +allocator record lists on each namespace of the nvdimm pages allocator +set. 
Each per namespace record list is represented by the following +struct bch_nvmpg_recs. + +- struct bch_nvmpg_recs + This structure represents a requester's allocation record list. Member +uuid is same value as the uuid of its corresponding struct +bch_nvmpg_head. Member recs[] is a table of struct bch_pgalloc_rec +objects to trace all allocated nvdimm pages. If the table recs[] is +full, the nvmpg format offset is a pointer points to the next struct +bch_nvmpg_recs object, nvm pages allocator will look for available free +allocation record there. All the linked struct bch_nvmpg_recs objects +compose a requester's allocation record list which is headed by the above +struct bch_nvmpg_head. + +- struct bch_nvmpg_rec + This structure records a range of allocated nvdimm pages. Member pgoff +is offset in unit of page size of this allocation range. Member order +indicates size of the allocation range by (1 << order) in unit of page +size. Because the nvdimm pages allocator set may have multiple nvdimm +namespaces, member ns_id is used to identify which namespace the pgoff +belongs to. + - Bits 0 - 51: pgoff - is pages offset of the allocated pages. + - Bits 52 - 57: order - allocated size in page_size * order-of-2 + - Bits 58 - 60: ns_id - identify which namespace the pages stays on + - Bits 61 - 63: reserved. +Since each of the allocated nvm pages are power of 2, using 6 bits to +represent allocated size can have (1 << ((1 << 6) - 1)) * PAGE_SIZE maximum +value. It can be a 76 bits width range size in byte for 4KB page size, +which is large enough currently. + +All the structure members having _offset suffix are in a special format. +E.g. bch_nvmpg_sb.{sb_offset, pages_offset, set_header_offset}, +bch_nvmpg_head.recs_offset, bch_nvmpg_recs.{head_offset, next_offset}, +the offset value is 64bit, the most significant 3 bits are used to +identify which namespace this offset belongs to, and the rested 61 bits +are actual offset inside the namespace. 
Following patches will have +helper routines to do the conversion between memory pointer and offset. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Ying Huang <ying.huang@intel.com> +--- + drivers/md/bcache/nvmpg_format.h | 253 +++++++++++++++++++++++++++++++ + 1 file changed, 253 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg_format.h + +diff --git a/drivers/md/bcache/nvmpg_format.h b/drivers/md/bcache/nvmpg_format.h +new file mode 100644 +index 000000000000..e9eb6371fd78 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg_format.h +@@ -0,0 +1,253 @@ ++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ ++#ifndef _NVMPG_FORMAT_H ++#define _NVMPG_FORMAT_H ++ ++/* ++ * Bcache on NVDIMM data structures ++ */ ++ ++/* ++ * - struct bch_nvmpg_sb ++ * This is the super block allocated on each nvdimm namespace for the nvm ++ * pages allocator. A nvdimm pages allocator set may have multiple namespaces, ++ * bch_nvmpg_sb->set_uuid is used to mark which nvdimm set this name space ++ * belongs to. ++ * ++ * - struct bch_nvmpg_header ++ * This is a table for all heads of all allocation record lists. An allo- ++ * cation record list traces all page(s) allocated from nvdimm namespace(s) to ++ * a specific requester (identified by uuid). After system reboot, a requester ++ * can retrieve all previously allocated nvdimm pages from its record list by a ++ * pre-defined uuid. ++ * ++ * - struct bch_nvmpg_head ++ * This is a head of an allocation record list. Each nvdimm pages requester ++ * (typically it's a driver) has and only has one allocation record list, and ++ * an allocated nvdimm page only belongs to a specific allocation record list. ++ * Member uuid[] will be set as the requester's uuid, e.g. 
for bcache it is the ++ * cache set uuid. Member label is not mandatory, it is a human-readable string ++ * for debug purpose. The nvm offset format pointers recs_offset[] point to the ++ * location of actual allocator record lists on each name space of the nvdimm ++ * pages allocator set. Each per name space record list is represented by the ++ * following struct bch_nvmpg_recs. ++ * ++ * - struct bch_nvmpg_recs ++ * This structure represents a requester's allocation record list. Member uuid ++ * is the same value as the uuid of its corresponding struct bch_nvmpg_head. ++ * Member recs[] is a table of struct bch_nvmpg_rec objects to trace all ++ * allocated nvdimm pages. If the table recs[] is full, member next_offset is ++ * a nvmpg offset format pointer to the next struct bch_nvmpg_recs object, ++ * where the nvm pages allocator will look for a free allocation record. All ++ * the linked struct bch_nvmpg_recs objects compose a requester's allocation ++ * record list which is headed by the above struct bch_nvmpg_head. ++ * ++ * - struct bch_nvmpg_rec ++ * This structure records a range of allocated nvdimm pages. Member pgoff is ++ * offset in unit of page size of this allocation range. Member order indicates ++ * size of the allocation range by (1 << order) in unit of page size. Because ++ * the nvdimm pages allocator set may have multiple nvdimm name spaces, member ++ * ns_id is used to identify which name space the pgoff belongs to. ++ * ++ * All allocation record lists are stored on the first initialized nvdimm name- ++ * space (ns_id 0). 
The meta data default layout of nvm pages allocator on ++ * namespace 0 is, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ <-- BCH_NVMPG_RECLIST_HEAD_OFFSET ++ * | bch_nvmpg_header | ++ * | | ++ * 16KB +---------------------------------+ <-- BCH_NVMPG_SYSRECS_OFFSET ++ * | bch_nvmpg_recs | ++ * | (nvm pages internal usage) | ++ * 24KB +---------------------------------+ ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * ++ * Meta data default layout on rested nvdimm namespaces, ++ * ++ * 0 +---------------------------------+ ++ * | | ++ * 4KB +---------------------------------+ <-- BCH_NVMPG_SB_OFFSET ++ * | bch_nvmpg_sb | ++ * 8KB +---------------------------------+ ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * | | ++ * 16MB +---------------------------------+ <-- BCH_NVMPG_START ++ * | allocable nvm pages | ++ * | for buddy allocator | ++ * end +---------------------------------+ ++ * ++ * ++ * - The nvmpg offset format pointer ++ * All member names ending with _offset in this header are nvmpg offset ++ * format pointer. The offset format is, ++ * [highest 3 bits: ns_id] ++ * [rested 61 bits: offset in No. ns_id namespace] ++ * ++ * The above offset is byte unit, the procedure to reference a nvmpg offset ++ * format pointer is, ++ * 1) Identify the namespace related in-memory structure by ns_id from the ++ * highest 3 bits of offset value. ++ * 2) Get the DAX mapping base address from the in-memory structure. ++ * 3) Calculate the actual memory address on nvdimm by plusing the DAX base ++ * address with offset value in rested low 61 bits. 
++ * All related in-memory structure and conversion routines don't belong to ++ * user space api, they are defined by nvm-pages allocator code in ++ * drivers/md/bcache/nvm-pages.{c,h} ++ * ++ */ ++ ++#include <linux/types.h> ++ ++/* In sectors */ ++#define BCH_NVMPG_SB_OFFSET 4096 ++#define BCH_NVMPG_START (16 << 20) ++ ++#define BCH_NVMPG_LBL_SIZE 32 ++#define BCH_NVMPG_NS_MAX 8 ++ ++#define BCH_NVMPG_RECLIST_HEAD_OFFSET (8<<10) ++#define BCH_NVMPG_SYSRECS_OFFSET (16<<10) ++ ++#define BCH_NVMPG_SB_VERSION 0 ++#define BCH_NVMPG_SB_VERSION_MAX 0 ++ ++static const __u8 bch_nvmpg_magic[] = { ++ 0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83, ++ 0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 }; ++static const __u8 bch_nvmpg_recs_magic[] = { ++ 0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9, ++ 0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae }; ++ ++/* takes 64bit width */ ++struct bch_nvmpg_rec { ++ union { ++ struct { ++ __u64 pgoff:52; ++ __u64 order:6; ++ __u64 ns_id:3; ++ __u64 reserved:3; ++ }; ++ __u64 _v; ++ }; ++}; ++ ++struct bch_nvmpg_recs { ++ union { ++ struct { ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvmpg_head ++ */ ++ __u64 head_offset; ++ /* ++ * A nvmpg offset format pointer to ++ * struct bch_nvm_pgalloc_recs which contains ++ * the next recs[] array. ++ */ ++ __u64 next_offset; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_rec recs[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_RECS \ ++ ((sizeof(struct bch_nvmpg_recs) - \ ++ offsetof(struct bch_nvmpg_recs, recs)) / \ ++ sizeof(struct bch_nvmpg_rec)) ++ ++#define BCH_NVMPG_HD_STAT_FREE 0x0 ++#define BCH_NVMPG_HD_STAT_ALLOC 0x1 ++struct bch_nvmpg_head { ++ __u8 uuid[16]; ++ __u8 label[BCH_NVMPG_LBL_SIZE]; ++ __u32 state; ++ __u32 flags; ++ /* ++ * Array of offset values from the nvmpg offset format ++ * pointers, each of the pointer points to a per-namespace ++ * struct bch_nvmpg_recs. 
++ */ ++ __u64 recs_offset[BCH_NVMPG_NS_MAX]; ++}; ++ ++/* heads[0] is always for nvm_pages internal usage */ ++struct bch_nvmpg_set_header { ++ union { ++ struct { ++ __u32 size; ++ __u32 used; ++ __u64 _pad[4]; ++ struct bch_nvmpg_head heads[]; ++ }; ++ __u8 pad[8192]; ++ }; ++}; ++ ++#define BCH_NVMPG_MAX_HEADS \ ++ ((sizeof(struct bch_nvmpg_set_header) - \ ++ offsetof(struct bch_nvmpg_set_header, heads)) / \ ++ sizeof(struct bch_nvmpg_head)) ++ ++/* The on-media bit order is local CPU order */ ++struct bch_nvmpg_sb { ++ __u64 csum; ++ __u64 sb_offset; ++ __u64 ns_start; ++ __u64 version; ++ __u8 magic[16]; ++ __u8 uuid[16]; ++ __u32 page_size; ++ __u32 total_ns; ++ __u32 this_ns; ++ union { ++ __u8 set_uuid[16]; ++ __u64 set_magic; ++ }; ++ ++ __u64 flags; ++ __u64 seq; ++ ++ __u64 feature_compat; ++ __u64 feature_incompat; ++ __u64 feature_ro_compat; ++ ++ /* For allocable nvm pages from buddy systems */ ++ __u64 pages_offset; ++ __u64 pages_total; ++ ++ __u64 pad[8]; ++ ++ /* ++ * A nvmpg offset format pointer, it points ++ * to struct bch_nvmpg_set_header which is ++ * stored only on the first name space. ++ */ ++ __u64 set_header_offset; ++ ++ /* Just for csum_set() */ ++ __u32 keys; ++ __u64 d[0]; ++}; ++ ++#endif /* _NVMPG_FORMAT_H */ +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0002-bcache-initialize-the-nvm-pages-allocator.patch b/for-test/nvdimm-support/meta-dev-20230303/0002-bcache-initialize-the-nvm-pages-allocator.patch new file mode 100644 index 0000000..8a2f463 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0002-bcache-initialize-the-nvm-pages-allocator.patch @@ -0,0 +1,535 @@ +From 08ce6a36470047a30ac9db26714a566280adddde Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 26 Jul 2021 10:33:30 +0800 +Subject: [PATCH 02/16] bcache: initialize the nvm pages allocator + +This patch define the prototype data structures in memory and +initializes the nvm pages allocator. 
+ +The nvm address space which is managed by this allocator can consist of +many nvm namespaces, and some namespaces can compose into one nvm set, +like cache set. For this initial implementation, only one set can be +supported. + +The users of this nvm pages allocator need to call register_namespace() +to register the nvdimm device (like /dev/pmemX) into this allocator as +the instance of struct nvm_namespace. + +Reported-by: Randy Dunlap <rdunlap@infradead.org> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/Kconfig | 10 ++ + drivers/md/bcache/Makefile | 1 + + drivers/md/bcache/nvmpg.c | 333 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 97 +++++++++++ + drivers/md/bcache/super.c | 3 + + 5 files changed, 444 insertions(+) + create mode 100644 drivers/md/bcache/nvmpg.c + create mode 100644 drivers/md/bcache/nvmpg.h + +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index cf3e8096942a..4a7c13e882bb 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -36,3 +36,13 @@ config BCACHE_ASYNC_REGISTRATION + device path into this file will returns immediately and the real + registration work is handled in kernel work queue in asynchronous + way. ++ ++config BCACHE_NVM_PAGES ++ bool "NVDIMM support for bcache (EXPERIMENTAL)" ++ depends on BCACHE ++ depends on 64BIT ++ depends on LIBNVDIMM ++ depends on DAX ++ help ++ Allocate/release NV-memory pages for bcache and provide allocated pages ++ for each requestor after system reboot. 
+diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..276b33be5ad5 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -5,3 +5,4 @@ obj-$(CONFIG_BCACHE) += bcache.o + bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o ++bcache-$(CONFIG_BCACHE_NVM_PAGES) += nvmpg.o +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +new file mode 100644 +index 000000000000..8f4e7fc1ad14 +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.c +@@ -0,0 +1,333 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Nvdimm page-buddy allocator ++ * ++ * Copyright (c) 2021, Intel Corporation. ++ * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>. ++ * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>. ++ */ ++ ++#include "bcache.h" ++#include "nvmpg.h" ++ ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/mutex.h> ++#include <linux/dax.h> ++#include <linux/pfn_t.h> ++#include <linux/libnvdimm.h> ++#include <linux/mm_types.h> ++#include <linux/err.h> ++#include <linux/pagemap.h> ++#include <linux/bitmap.h> ++#include <linux/blkdev.h> ++ ++struct bch_nvmpg_set *global_nvmpg_set; ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ int ns_id; ++ struct bch_nvmpg_ns *ns; ++ ++ if (offset == 0) ++ return NULL; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(offset); ++ ns = global_nvmpg_set->ns_tbl[ns_id]; ++ ++ if (ns) ++ return (void *)(ns->base_addr + BCH_NVMPG_GET_OFFSET(offset)); ++ ++ pr_err("Invalid ns_id %u\n", ns_id); ++ return NULL; ++} ++ ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = (unsigned long)(ptr - ns->base_addr); ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++static void release_ns_tbl(struct bch_nvmpg_set *set) ++{ ++ int i; ++ struct bch_nvmpg_ns *ns; ++ ++ for (i = 
0; i < BCH_NVMPG_NS_MAX; i++) { ++ ns = set->ns_tbl[i]; ++ if (ns) { ++ fs_put_dax(ns->dax_dev); ++ blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); ++ set->ns_tbl[i] = NULL; ++ set->attached_ns--; ++ kfree(ns); ++ } ++ } ++ ++ if (set->attached_ns) ++ pr_err("unexpected attached_ns: %u\n", set->attached_ns); ++} ++ ++static void release_nvmpg_set(struct bch_nvmpg_set *set) ++{ ++ release_ns_tbl(set); ++ kfree(set); ++} ++ ++/* Namespace 0 contains all meta data of the nvmpg allocation set */ ++static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_set_header *set_header; ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first nvmpg namespace.\n", ++ ns->ns_id); ++ return -EINVAL; ++ } ++ ++ set_header = bch_nvmpg_offset_to_ptr(ns->sb->set_header_offset); ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ global_nvmpg_set->set_header = set_header; ++ global_nvmpg_set->heads_size = set_header->size; ++ global_nvmpg_set->heads_used = set_header->used; ++ mutex_unlock(&global_nvmpg_set->lock); ++ ++ return 0; ++} ++ ++static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) ++{ ++ struct bch_nvmpg_sb *sb = ns->sb; ++ int rc = 0; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ if (global_nvmpg_set->ns_tbl[sb->this_ns]) { ++ pr_err("ns_id %u already attached.\n", ns->ns_id); ++ rc = -EEXIST; ++ goto unlock; ++ } ++ ++ if (ns->ns_id != 0) { ++ pr_err("unexpected ns_id %u for first namespace.\n", ns->ns_id); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ if (global_nvmpg_set->attached_ns > 0) { ++ pr_err("multiple namespace attaching not supported yet\n"); ++ rc = -EOPNOTSUPP; ++ goto unlock; ++ } ++ ++ if ((global_nvmpg_set->attached_ns + 1) > sb->total_ns) { ++ pr_err("namespace counters error: attached %u > total %u\n", ++ global_nvmpg_set->attached_ns, ++ global_nvmpg_set->total_ns); ++ rc = -EINVAL; ++ goto unlock; ++ } ++ ++ memcpy(global_nvmpg_set->set_uuid, sb->set_uuid, 16); ++ global_nvmpg_set->ns_tbl[sb->this_ns] = ns; ++ 
global_nvmpg_set->attached_ns++; ++ global_nvmpg_set->total_ns = sb->total_ns; ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} ++ ++static int read_nvdimm_meta_super(struct block_device *bdev, ++ struct bch_nvmpg_ns *ns) ++{ ++ struct page *page; ++ struct bch_nvmpg_sb *sb; ++ uint64_t expected_csum = 0; ++ int r; ++ ++ page = read_cache_page_gfp(bdev->bd_inode->i_mapping, ++ BCH_NVMPG_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); ++ ++ if (IS_ERR(page)) ++ return -EIO; ++ ++ sb = (struct bch_nvmpg_sb *) ++ (page_address(page) + offset_in_page(BCH_NVMPG_SB_OFFSET)); ++ ++ r = -EINVAL; ++ expected_csum = csum_set(sb); ++ if (expected_csum != sb->csum) { ++ pr_info("csum is not match with expected one\n"); ++ goto put_page; ++ } ++ ++ if (memcmp(sb->magic, bch_nvmpg_magic, sizeof(bch_nvmpg_magic))) { ++ pr_info("invalid bch_nvmpg_magic\n"); ++ goto put_page; ++ } ++ ++ if (sb->sb_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_SB_OFFSET)) { ++ pr_info("invalid superblock offset 0x%llx\n", sb->sb_offset); ++ goto put_page; ++ } ++ ++ r = -EOPNOTSUPP; ++ if (sb->total_ns != 1) { ++ pr_info("multiple name space not supported yet.\n"); ++ goto put_page; ++ } ++ ++ ++ r = 0; ++ /* Necessary for DAX mapping */ ++ ns->page_size = sb->page_size; ++ ns->pages_total = sb->pages_total; ++ ++put_page: ++ put_page(page); ++ return r; ++} ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size) ++{ ++ struct bch_nvmpg_ns *ns = NULL; ++ struct bch_nvmpg_sb *sb = NULL; ++ char buf[BDEVNAME_SIZE]; ++ struct block_device *bdev; ++ pgoff_t pgoff; ++ u64 start_off; ++ int id, err; ++ char *path; ++ long dax_ret = 0; ++ ++ path = kstrndup(dev_path, size, GFP_KERNEL); ++ if (!path) { ++ pr_err("kstrndup failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ bdev = blkdev_get_by_path(strim(path), ++ FMODE_READ|FMODE_WRITE|FMODE_EXEC, ++ global_nvmpg_set); ++ if (IS_ERR(bdev)) { ++ pr_err("get %s error: %ld\n", dev_path, PTR_ERR(bdev)); ++ 
kfree(path); ++ return ERR_PTR(PTR_ERR(bdev)); ++ } ++ ++ err = -ENOMEM; ++ ns = kzalloc(sizeof(struct bch_nvmpg_ns), GFP_KERNEL); ++ if (!ns) ++ goto bdput; ++ ++ err = -EIO; ++ if (read_nvdimm_meta_super(bdev, ns)) { ++ pr_err("%s read nvdimm meta super block failed.\n", ++ bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ err = -EOPNOTSUPP; ++ ns->dax_dev = fs_dax_get_by_bdev(bdev, &start_off); ++ if (!ns->dax_dev) { ++ pr_err("%s don't support DAX\n", bdevname(bdev, buf)); ++ goto free_ns; ++ } ++ ++ pgoff = start_off >> PAGE_SHIFT; ++ ++ err = -EINVAL; ++ id = dax_read_lock(); ++ dax_ret = dax_direct_access(ns->dax_dev, pgoff, ns->pages_total, ++ DAX_ACCESS, &ns->base_addr, &ns->start_pfn); ++ if (dax_ret <= 0) { ++ pr_err("dax_direct_access error\n"); ++ dax_read_unlock(id); ++ goto free_ns; ++ } ++ ++ if (dax_ret < ns->pages_total) { ++ pr_warn("mapped range %ld is less than ns->pages_total %lu\n", ++ dax_ret, ns->pages_total); ++ } ++ dax_read_unlock(id); ++ ++ sb = (struct bch_nvmpg_sb *)(ns->base_addr + BCH_NVMPG_SB_OFFSET); ++ ++ err = -EINVAL; ++ /* Check magic again to make sure DAX mapping is correct */ ++ if (memcmp(sb->magic, bch_nvmpg_magic, sizeof(bch_nvmpg_magic))) { ++ pr_err("invalid bch_nvmpg_magic after DAX mapping\n"); ++ goto free_ns; ++ } ++ ++ if ((global_nvmpg_set->attached_ns > 0) && ++ memcmp(sb->set_uuid, global_nvmpg_set->set_uuid, 16)) { ++ pr_err("set uuid does not match with ns_id %u\n", ns->ns_id); ++ goto free_ns; ++ } ++ ++ if (sb->set_header_offset != ++ BCH_NVMPG_OFFSET(sb->this_ns, BCH_NVMPG_RECLIST_HEAD_OFFSET)) { ++ pr_err("Invalid header offset: this_ns %u, ns_id %llu, offset 0x%llx\n", ++ sb->this_ns, ++ BCH_NVMPG_GET_NS_ID(sb->set_header_offset), ++ BCH_NVMPG_GET_OFFSET(sb->set_header_offset)); ++ goto free_ns; ++ } ++ ++ ns->page_size = sb->page_size; ++ ns->pages_offset = sb->pages_offset; ++ ns->pages_total = sb->pages_total; ++ ns->sb = sb; ++ ns->free = 0; ++ ns->bdev = bdev; ++ ns->set = global_nvmpg_set; ++ ++ 
err = attach_nvmpg_set(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ mutex_init(&ns->lock); ++ ++ err = init_nvmpg_set_header(ns); ++ if (err < 0) ++ goto free_ns; ++ ++ kfree(path); ++ return ns; ++ ++free_ns: ++ if (ns->dax_dev) ++ fs_put_dax(ns->dax_dev); ++ kfree(ns); ++bdput: ++ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); ++ kfree(path); ++ return ERR_PTR(err); ++} ++EXPORT_SYMBOL_GPL(bch_register_namespace); ++ ++int __init bch_nvmpg_init(void) ++{ ++ global_nvmpg_set = kzalloc(sizeof(*global_nvmpg_set), GFP_KERNEL); ++ if (!global_nvmpg_set) ++ return -ENOMEM; ++ ++ global_nvmpg_set->total_ns = 0; ++ mutex_init(&global_nvmpg_set->lock); ++ ++ pr_info("bcache nvm init\n"); ++ return 0; ++} ++ ++void bch_nvmpg_exit(void) ++{ ++ release_nvmpg_set(global_nvmpg_set); ++ pr_info("bcache nvm exit\n"); ++} +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +new file mode 100644 +index 000000000000..45e14df202ca +--- /dev/null ++++ b/drivers/md/bcache/nvmpg.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _BCACHE_NVM_PAGES_H ++#define _BCACHE_NVM_PAGES_H ++ ++#include <linux/libnvdimm.h> ++ ++#include "nvmpg_format.h" ++ ++/* ++ * Bcache NVDIMM in memory data structures ++ */ ++ ++/* ++ * The following three structures in memory records which page(s) allocated ++ * to which owner. After reboot from power failure, they will be initialized ++ * based on nvm pages superblock in NVDIMM device. ++ */ ++struct bch_nvmpg_ns { ++ struct bch_nvmpg_sb *sb; ++ void *base_addr; ++ ++ unsigned char uuid[16]; ++ int ns_id; ++ unsigned int page_size; ++ unsigned long free; ++ unsigned long pages_offset; ++ unsigned long pages_total; ++ pfn_t start_pfn; ++ ++ struct dax_device *dax_dev; ++ struct block_device *bdev; ++ struct bch_nvmpg_set *set; ++ ++ struct mutex lock; ++}; ++ ++/* ++ * A set of namespaces. Currently only one set can be supported. 
++ */ ++struct bch_nvmpg_set { ++ unsigned char set_uuid[16]; ++ ++ int heads_size; ++ int heads_used; ++ struct bch_nvmpg_set_header *set_header; ++ ++ struct bch_nvmpg_ns *ns_tbl[BCH_NVMPG_NS_MAX]; ++ int total_ns; ++ int attached_ns; ++ ++ struct mutex lock; ++}; ++ ++#define BCH_NVMPG_NS_ID_BITS 3 ++#define BCH_NVMPG_OFFSET_BITS 61 ++#define BCH_NVMPG_NS_ID_MASK ((1UL<<BCH_NVMPG_NS_ID_BITS) - 1) ++#define BCH_NVMPG_OFFSET_MASK ((1UL<<BCH_NVMPG_OFFSET_BITS) - 1) ++ ++#define BCH_NVMPG_GET_NS_ID(offset) \ ++ (((offset) >> BCH_NVMPG_OFFSET_BITS) & BCH_NVMPG_NS_ID_MASK) ++ ++#define BCH_NVMPG_GET_OFFSET(offset) ((offset) & BCH_NVMPG_OFFSET_MASK) ++ ++#define BCH_NVMPG_OFFSET(ns_id, offset) \ ++ ((((ns_id) & BCH_NVMPG_NS_ID_MASK) << BCH_NVMPG_OFFSET_BITS) | \ ++ ((offset) & BCH_NVMPG_OFFSET_MASK)) ++ ++/* Indicate which field in bch_nvmpg_sb to be updated */ ++#define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ ++ ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size); ++int bch_nvmpg_init(void); ++void bch_nvmpg_exit(void); ++ ++#else ++ ++static inline struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size) ++{ ++ return NULL; ++} ++ ++static inline int bch_nvmpg_init(void) ++{ ++ return 0; ++} ++ ++static inline void bch_nvmpg_exit(void) { } ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 3563d15dbaf2..ffe79871aa69 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -14,6 +14,7 @@ + #include "request.h" + #include "writeback.h" + #include "features.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/pagemap.h> +@@ -2816,6 +2817,7 @@ static void bcache_exit(void) + { + bch_debug_exit(); + 
bch_request_exit(); ++ bch_nvmpg_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) +@@ -2914,6 +2916,7 @@ static int __init bcache_init(void) + + bch_debug_init(); + closure_debug_init(); ++ bch_nvmpg_init(); + + bcache_is_reboot = false; + +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0003-bcache-initialization-of-the-buddy.patch b/for-test/nvdimm-support/meta-dev-20230303/0003-bcache-initialization-of-the-buddy.patch new file mode 100644 index 0000000..1b2ebca --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0003-bcache-initialization-of-the-buddy.patch @@ -0,0 +1,358 @@ +From 83b67de501eda0f93e0c77f3201db343577f1f2f Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 4 Jul 2022 14:30:24 +0800 +Subject: [PATCH 03/16] bcache: initialization of the buddy + +This nvm pages allocator will implement the simple buddy allocator to +anage the nvm address space. This patch initializes this buddy allocator +for new namespace. + +the unit of alloc/free of the buddy allocator is page. DAX device has +their struct page(in dram or PMEM). + + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + void *zone_device_data; + /* + * ZONE_DEVICE private pages are counted as being + * mapped so the next 3 words hold the mapping, index, + * and private fields from the source anonymous or + * page cache page while the page is migrated to device + * private memory. + * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also + * use the mapping, index, and private fields when + * pmem backed DAX files are mapped. + */ + }; + +ZONE_DEVICE pages only use pgmap. Other 4 words[16/32 bytes] don't use. +So the second/third word will be used as 'struct list_head ' which list +in buddy. The fourth word(that is normal struct page::index) store pgoff +which the page-offset in the dax device. 
And the fifth word (that is +normal struct page::private) store order of buddy. page_type will be used +to store buddy flags. + +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 211 +++++++++++++++++++++++++++++++++++++- + drivers/md/bcache/nvmpg.h | 12 +++ + 2 files changed, 220 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 8f4e7fc1ad14..feba36ab5541 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -50,6 +50,36 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + return BCH_NVMPG_OFFSET(ns_id, offset); + } + ++static struct page *bch_nvmpg_va_to_pg(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) ++{ ++ return ns->base_addr + (pgoff << PAGE_SHIFT); ++} ++ ++static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) ++{ ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ pgoff_t pgoff = r->pgoff; ++ ++ return bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++} ++ ++static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff, u64 nr) ++{ ++ while (nr > 0) { ++ unsigned int num = nr > UINT_MAX ? 
UINT_MAX : nr; ++ ++ bitmap_set(ns->pages_bitmap, pgoff, num); ++ nr -= num; ++ pgoff += num; ++ } ++} ++ + static void release_ns_tbl(struct bch_nvmpg_set *set) + { + int i; +@@ -58,6 +88,10 @@ static void release_ns_tbl(struct bch_nvmpg_set *set) + for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { + ns = set->ns_tbl[i]; + if (ns) { ++ kvfree(ns->pages_bitmap); ++ if (ns->recs_bitmap) ++ bitmap_free(ns->recs_bitmap); ++ + fs_put_dax(ns->dax_dev); + blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC); + set->ns_tbl[i] = NULL; +@@ -76,10 +110,73 @@ static void release_nvmpg_set(struct bch_nvmpg_set *set) + kfree(set); + } + ++static int validate_recs(int ns_id, ++ struct bch_nvmpg_head *head, ++ struct bch_nvmpg_recs *recs) ++{ ++ if (memcmp(recs->magic, bch_nvmpg_recs_magic, sizeof(bch_nvmpg_recs_magic))) { ++ pr_err("Invalid bch_nvmpg_recs magic\n"); ++ return -EINVAL; ++ } ++ ++ if (memcmp(recs->uuid, head->uuid, 16)) { ++ pr_err("Invalid bch_nvmpg_recs uuid\n"); ++ return -EINVAL; ++ } ++ ++ if (recs->head_offset != ++ bch_nvmpg_ptr_to_offset(global_nvmpg_set->ns_tbl[ns_id], head)) { ++ pr_err("Invalid recs head_offset\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int reserve_nvmpg_recs(struct bch_nvmpg_recs *recs) ++{ ++ int i, used = 0; ++ ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *r = &recs->recs[i]; ++ struct bch_nvmpg_ns *ns; ++ struct page *page; ++ void *addr; ++ ++ if (r->pgoff == 0) ++ continue; ++ ++ ns = global_nvmpg_set->ns_tbl[r->ns_id]; ++ addr = bch_nvmpg_rec_to_ptr(r); ++ if (addr < ns->base_addr) { ++ pr_err("Invalid recorded address\n"); ++ return -EINVAL; ++ } ++ ++ /* init struct page: index/private */ ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, r->order); ++ page->index = r->pgoff; ++ ++ reserve_nvmpg_pages(ns, r->pgoff, 1L << r->order); ++ used++; ++ } ++ ++ if (used != recs->used) { ++ pr_err("used %d doesn't match recs->used %d\n", ++ used, recs->used); ++ return -EINVAL; ++ } ++ ++ return 
0; ++} ++ + /* Namespace 0 contains all meta data of the nvmpg allocation set */ + static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_set_header *set_header; ++ struct bch_nvmpg_recs *sys_recs; ++ int i, j, used = 0, rc = 0; + + if (ns->ns_id != 0) { + pr_err("unexpected ns_id %u for first nvmpg namespace.\n", +@@ -93,9 +190,82 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + global_nvmpg_set->set_header = set_header; + global_nvmpg_set->heads_size = set_header->size; + global_nvmpg_set->heads_used = set_header->used; ++ ++ /* Reserve the used space from buddy allocator */ ++ reserve_nvmpg_pages(ns, 0, div_u64(ns->pages_offset, ns->page_size)); ++ ++ sys_recs = ns->base_addr + BCH_NVMPG_SYSRECS_OFFSET; ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *head; ++ ++ head = &set_header->heads[i]; ++ if (head->state == BCH_NVMPG_HD_STAT_FREE) ++ continue; ++ ++ used++; ++ if (used > global_nvmpg_set->heads_size) { ++ pr_err("used heads %d > heads size %d.\n", ++ used, global_nvmpg_set->heads_size); ++ goto unlock; ++ } ++ ++ for (j = 0; j < BCH_NVMPG_NS_MAX; j++) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[j]); ++ ++ /* Iterate the recs list */ ++ while (recs) { ++ rc = validate_recs(j, head, recs); ++ if (rc < 0) ++ goto unlock; ++ ++ rc = reserve_nvmpg_recs(recs); ++ if (rc < 0) ++ goto unlock; ++ ++ bitmap_set(ns->recs_bitmap, recs - sys_recs, 1); ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ } ++ } ++unlock: + mutex_unlock(&global_nvmpg_set->lock); ++ return rc; ++} + +- return 0; ++static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) ++{ ++ unsigned int start, end, pages; ++ int i; ++ struct page *page; ++ pgoff_t pgoff_start; ++ ++ for_each_clear_bitrange(start, end, ns->pages_bitmap, ns->pages_total) { ++ pgoff_start = start; ++ pages = end - start; ++ ++ while (pages) { ++ void *addr; ++ ++ for (i = BCH_MAX_ORDER - 1; i >= 0; 
i--) { ++ if ((pgoff_start % (1L << i) == 0) && ++ (pages >= (1L << i))) ++ break; ++ } ++ ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); ++ page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(page, i); ++ page->index = pgoff_start; ++ __SetPageBuddy(page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[i]); ++ ++ pgoff_start += 1L << i; ++ pages -= 1L << i; ++ } ++ } + } + + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) +@@ -201,7 +371,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size) + struct block_device *bdev; + pgoff_t pgoff; + u64 start_off; +- int id, err; ++ int id, i, err; + char *path; + long dax_ret = 0; + +@@ -295,13 +465,48 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size) + + mutex_init(&ns->lock); + ++ /* ++ * parameters of bitmap_set/clear are unsigned int. ++ * Given currently size of nvm is far from exceeding this limit, ++ * so only add a WARN_ON message. ++ */ ++ WARN_ON(BITS_TO_LONGS(ns->pages_total) > UINT_MAX); ++ ns->pages_bitmap = kvcalloc(BITS_TO_LONGS(ns->pages_total), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!ns->pages_bitmap) { ++ err = -ENOMEM; ++ goto clear_ns_nr; ++ } ++ ++ if (ns->sb->this_ns == 0) { ++ ns->recs_bitmap = ++ bitmap_zalloc(BCH_MAX_PGALLOC_RECS, GFP_KERNEL); ++ if (ns->recs_bitmap == NULL) { ++ err = -ENOMEM; ++ goto free_pages_bitmap; ++ } ++ } ++ ++ for (i = 0; i < BCH_MAX_ORDER; i++) ++ INIT_LIST_HEAD(&ns->free_area[i]); ++ + err = init_nvmpg_set_header(ns); + if (err < 0) +- goto free_ns; ++ goto free_recs_bitmap; ++ ++ if (ns->sb->this_ns == 0) ++ /* init buddy allocator */ ++ bch_nvmpg_init_free_space(ns); + + kfree(path); + return ns; + ++free_recs_bitmap: ++ bitmap_free(ns->recs_bitmap); ++free_pages_bitmap: ++ kvfree(ns->pages_bitmap); ++clear_ns_nr: ++ global_nvmpg_set->ns_tbl[sb->this_ns] = NULL; + free_ns: + if (ns->dax_dev) + fs_put_dax(ns->dax_dev); +diff --git a/drivers/md/bcache/nvmpg.h 
b/drivers/md/bcache/nvmpg.h +index 45e14df202ca..1e2108221630 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -11,6 +11,8 @@ + * Bcache NVDIMM in memory data structures + */ + ++#define BCH_MAX_ORDER 20 ++ + /* + * The following three structures in memory records which page(s) allocated + * to which owner. After reboot from power failure, they will be initialized +@@ -28,6 +30,11 @@ struct bch_nvmpg_ns { + unsigned long pages_total; + pfn_t start_pfn; + ++ unsigned long *pages_bitmap; ++ struct list_head free_area[BCH_MAX_ORDER]; ++ ++ unsigned long *recs_bitmap; ++ + struct dax_device *dax_dev; + struct block_device *bdev; + struct bch_nvmpg_set *set; +@@ -69,6 +76,11 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_MAX_PGALLOC_RECS \ ++ (min_t(unsigned int, 64, \ ++ (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ ++ sizeof(struct bch_nvmpg_recs))) ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset); + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch b/for-test/nvdimm-support/meta-dev-20230303/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch new file mode 100644 index 0000000..519f493 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0004-bcache-bch_nvmpg_alloc_pages-of-the-buddy.patch @@ -0,0 +1,309 @@ +From 7e5c0ec244687ee77485b002030740142bbb97cb Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Mon, 4 Jul 2022 14:53:04 +0800 +Subject: [PATCH 04/16] bcache: bch_nvmpg_alloc_pages() of the buddy + +This patch implements the bch_nvmpg_alloc_pages() of the nvm pages buddy +allocator. In terms of function, this func is like current +page-buddy-alloc. But the differences are: +a: it need owner_uuid as parameter which record owner info. 
And it +make those info persistence. +b: it don't need flags like GFP_*. All allocs are the equal. +c: it don't trigger other ops etc swap/recycle. + +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 222 ++++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 9 ++ + 2 files changed, 231 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index feba36ab5541..875983fee67e 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -42,6 +42,11 @@ void *bch_nvmpg_offset_to_ptr(unsigned long offset) + return NULL; + } + ++static unsigned long bch_nvmpg_offset_to_pgoff(unsigned long nvmpg_offset) ++{ ++ return BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> PAGE_SHIFT; ++} ++ + unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) + { + int ns_id = ns->ns_id; +@@ -60,6 +65,15 @@ static void *bch_nvmpg_pgoff_to_ptr(struct bch_nvmpg_ns *ns, pgoff_t pgoff) + return ns->base_addr + (pgoff << PAGE_SHIFT); + } + ++static unsigned long bch_nvmpg_pgoff_to_offset(struct bch_nvmpg_ns *ns, ++ pgoff_t pgoff) ++{ ++ int ns_id = ns->ns_id; ++ unsigned long offset = pgoff << PAGE_SHIFT; ++ ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ + static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + { + struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[r->ns_id]; +@@ -268,6 +282,214 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + } + } + ++ ++/* If not found, it will create if create == true */ ++static struct bch_nvmpg_head *find_nvmpg_head(const char *uuid, bool create) ++{ ++ struct bch_nvmpg_set_header *set_header = global_nvmpg_set->set_header; ++ struct bch_nvmpg_head *head = NULL; ++ int i; ++ ++ 
if (set_header == NULL) ++ goto out; ++ ++ for (i = 0; i < set_header->size; i++) { ++ struct bch_nvmpg_head *h = &set_header->heads[i]; ++ ++ if (h->state != BCH_NVMPG_HD_STAT_ALLOC) ++ continue; ++ ++ if (!memcmp(uuid, h->uuid, 16)) { ++ head = h; ++ break; ++ } ++ } ++ ++ if (!head && create) { ++ u32 used = set_header->used; ++ ++ if (set_header->size > used) { ++ head = &set_header->heads[used]; ++ memset(head, 0, sizeof(struct bch_nvmpg_head)); ++ head->state = BCH_NVMPG_HD_STAT_ALLOC; ++ memcpy(head->uuid, uuid, 16); ++ global_nvmpg_set->heads_used++; ++ set_header->used++; ++ } else ++ pr_info("No free bch_nvmpg_head\n"); ++ } ++ ++out: ++ return head; ++} ++ ++static struct bch_nvmpg_recs *find_empty_nvmpg_recs(void) ++{ ++ unsigned int start; ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[0]; ++ struct bch_nvmpg_recs *recs; ++ ++ start = bitmap_find_next_zero_area(ns->recs_bitmap, ++ BCH_MAX_PGALLOC_RECS, 0, 1, 0); ++ if (start > BCH_MAX_PGALLOC_RECS) { ++ pr_info("No free struct bch_nvmpg_recs\n"); ++ return NULL; ++ } ++ ++ bitmap_set(ns->recs_bitmap, start, 1); ++ recs = (struct bch_nvmpg_recs *) ++ bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET) ++ + start; ++ ++ memset(recs, 0, sizeof(struct bch_nvmpg_recs)); ++ return recs; ++} ++ ++ ++static struct bch_nvmpg_recs *find_nvmpg_recs(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_head *head, ++ bool create) ++{ ++ int ns_id = ns->sb->this_ns; ++ struct bch_nvmpg_recs *prev_recs = NULL, *recs = NULL; ++ ++ recs = bch_nvmpg_offset_to_ptr(head->recs_offset[ns_id]); ++ ++ /* If create=false, we return recs[nr] */ ++ if (!create) ++ return recs; ++ ++ /* ++ * If create=true, it mean we need a empty struct bch_nvmpg_rec ++ * So we should find non-empty struct bch_nvmpg_recs or alloc ++ * new struct bch_nvmpg_recs. 
And return this bch_nvmpg_recs ++ */ ++ while (recs && (recs->used == recs->size)) { ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++ ++ /* Found empty struct bch_nvmpg_recs */ ++ if (recs) ++ return recs; ++ ++ /* Need alloc new struct bch_nvmpg_recs */ ++ recs = find_empty_nvmpg_recs(); ++ if (recs) { ++ unsigned long offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = bch_nvmpg_ptr_to_offset(ns, head); ++ memcpy(recs->magic, bch_nvmpg_recs_magic, sizeof(bch_nvmpg_recs_magic)); ++ memcpy(recs->uuid, head->uuid, 16); ++ recs->size = BCH_NVMPG_MAX_RECS; ++ recs->used = 0; ++ ++ offset = bch_nvmpg_ptr_to_offset(ns, recs); ++ if (prev_recs) ++ prev_recs->next_offset = offset; ++ else ++ head->recs_offset[ns_id] = offset; ++ } ++ ++ return recs; ++} ++ ++static void add_nvmpg_rec(struct bch_nvmpg_ns *ns, ++ struct bch_nvmpg_recs *recs, ++ unsigned long nvmpg_offset, ++ int order) ++{ ++ int i, ns_id; ++ unsigned long pgoff; ++ ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ns_id = ns->sb->this_ns; ++ ++ for (i = 0; i < recs->size; i++) { ++ if (recs->recs[i].pgoff == 0) { ++ recs->recs[i].pgoff = pgoff; ++ recs->recs[i].order = order; ++ recs->recs[i].ns_id = ns_id; ++ recs->used++; ++ break; ++ } ++ } ++ BUG_ON(i == recs->size); ++} ++ ++ ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ unsigned long nvmpg_offset = 0; ++ struct bch_nvmpg_head *head; ++ int n, o; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ head = find_nvmpg_head(uuid, true); ++ ++ if (!head) { ++ pr_err("Cannot find bch_nvmpg_recs by uuid.\n"); ++ goto unlock; ++ } ++ ++ for (n = 0; n < global_nvmpg_set->total_ns; n++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[n]; ++ ++ if (!ns || (ns->free < (1L << order))) ++ continue; ++ ++ for (o = order; o < BCH_MAX_ORDER; o++) { ++ struct list_head *list; ++ struct page *page, *buddy_page; ++ ++ if (list_empty(&ns->free_area[o])) ++ continue; ++ ++ list = 
ns->free_area[o].next; ++ page = container_of((void *)list, struct page, ++ zone_device_data); ++ ++ list_del(list); ++ ++ while (o != order) { ++ void *addr; ++ pgoff_t pgoff; ++ ++ pgoff = page->index + (1L << (o - 1)); ++ addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(addr); ++ set_page_private(buddy_page, o - 1); ++ buddy_page->index = pgoff; ++ __SetPageBuddy(buddy_page); ++ list_add((struct list_head *)&buddy_page->zone_device_data, ++ &ns->free_area[o - 1]); ++ o--; ++ } ++ ++ set_page_private(page, order); ++ __ClearPageBuddy(page); ++ ns->free -= 1L << order; ++ nvmpg_offset = bch_nvmpg_pgoff_to_offset(ns, page->index); ++ break; ++ } ++ ++ if (o < BCH_MAX_ORDER) { ++ struct bch_nvmpg_recs *recs; ++ ++ recs = find_nvmpg_recs(ns, head, true); ++ /* ToDo: handle pgalloc_recs==NULL */ ++ add_nvmpg_rec(ns, recs, nvmpg_offset, order); ++ break; ++ } ++ } ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++ return nvmpg_offset; ++} ++EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 1e2108221630..e52bb1f3f79e 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -76,6 +76,9 @@ struct bch_nvmpg_set { + /* Indicate which field in bch_nvmpg_sb to be updated */ + #define BCH_NVMPG_TOTAL_NS 0 /* total_ns */ + ++#define BCH_PGOFF_TO_KVADDR(pgoff) \ ++ ((void *)((unsigned long)(pgoff) << PAGE_SHIFT)) ++ + #define BCH_MAX_PGALLOC_RECS \ + (min_t(unsigned int, 64, \ + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ +@@ -89,6 +92,7 @@ unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); + struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); ++unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + + #else + +@@ -104,6 +108,11 @@ static 
inline int bch_nvmpg_init(void) + + static inline void bch_nvmpg_exit(void) { } + ++static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) ++{ ++ return 0; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch b/for-test/nvdimm-support/meta-dev-20230303/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch new file mode 100644 index 0000000..b760b14 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0005-bcache-bch_nvmpg_free_pages-of-the-buddy-allocator.patch @@ -0,0 +1,252 @@ +From 3eeb48d89ffb7cf3670f44e0723a6ed73b14efec Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 19:06:35 +0800 +Subject: [PATCH 05/16] bcache: bch_nvmpg_free_pages() of the buddy allocator + +This patch implements the bch_nvmpg_free_pages() of the buddy allocator. + +The difference between this and page-buddy-free: +it need owner_uuid to free owner allocated pages, and must +persistent after free. 
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 165 ++++++++++++++++++++++++++++++++++++-- + drivers/md/bcache/nvmpg.h | 3 + + 2 files changed, 161 insertions(+), 7 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 875983fee67e..129938603bc7 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -248,6 +248,57 @@ static int init_nvmpg_set_header(struct bch_nvmpg_ns *ns) + return rc; + } + ++static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, ++ int order) ++{ ++ unsigned long add_pages = (1L << order); ++ pgoff_t pgoff; ++ struct page *page; ++ void *va; ++ ++ if (nvmpg_offset == 0) { ++ pr_err("free pages on offset 0\n"); ++ return; ++ } ++ ++ page = bch_nvmpg_va_to_pg(bch_nvmpg_offset_to_ptr(nvmpg_offset)); ++ WARN_ON((!page) || (page->private != order)); ++ pgoff = page->index; ++ ++ while (order < BCH_MAX_ORDER - 1) { ++ struct page *buddy_page; ++ ++ pgoff_t buddy_pgoff = pgoff ^ (1L << order); ++ pgoff_t parent_pgoff = pgoff & ~(1L << order); ++ ++ if ((parent_pgoff + (1L << (order + 1)) > ns->pages_total)) ++ break; ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, buddy_pgoff); ++ buddy_page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!buddy_page); ++ ++ if (PageBuddy(buddy_page) && (buddy_page->private == order)) { ++ list_del((struct list_head *)&buddy_page->zone_device_data); ++ __ClearPageBuddy(buddy_page); ++ pgoff = parent_pgoff; ++ order++; ++ continue; ++ } ++ break; ++ } ++ ++ va = bch_nvmpg_pgoff_to_ptr(ns, pgoff); ++ page = bch_nvmpg_va_to_pg(va); ++ WARN_ON(!page); ++ list_add((struct list_head *)&page->zone_device_data, ++ &ns->free_area[order]); ++ page->index = pgoff; ++ 
set_page_private(page, order); ++ __SetPageBuddy(page); ++ ns->free += add_pages; ++} ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +@@ -260,21 +311,19 @@ static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + pages = end - start; + + while (pages) { +- void *addr; +- + for (i = BCH_MAX_ORDER - 1; i >= 0; i--) { + if ((pgoff_start % (1L << i) == 0) && + (pages >= (1L << i))) + break; + } + +- addr = bch_nvmpg_pgoff_to_ptr(ns, pgoff_start); +- page = bch_nvmpg_va_to_pg(addr); ++ page = bch_nvmpg_va_to_pg( ++ bch_nvmpg_pgoff_to_ptr(ns, pgoff_start)); + set_page_private(page, i); + page->index = pgoff_start; +- __SetPageBuddy(page); +- list_add((struct list_head *)&page->zone_device_data, +- &ns->free_area[i]); ++ ++ /* In order to update ns->free */ ++ __free_space(ns, bch_nvmpg_pgoff_to_offset(ns, pgoff_start), i); + + pgoff_start += 1L << i; + pages -= 1L << i; +@@ -490,6 +539,107 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); + ++static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) ++{ ++ return ns->base_addr + (ns->pages_total << PAGE_SHIFT); ++} ++ ++static inline bool in_nvmpg_ns_range(struct bch_nvmpg_ns *ns, ++ void *start_addr, void *end_addr) ++{ ++ return (start_addr >= ns->base_addr) && (end_addr < nvm_end_addr(ns)); ++} ++ ++static int remove_nvmpg_rec(struct bch_nvmpg_recs *recs, int ns_id, ++ unsigned long nvmpg_offset, int order) ++{ ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *prev_recs, *sys_recs; ++ struct bch_nvmpg_ns *ns; ++ unsigned long pgoff; ++ int i; ++ ++ ns = global_nvmpg_set->ns_tbl[0]; ++ pgoff = bch_nvmpg_offset_to_pgoff(nvmpg_offset); ++ ++ head = bch_nvmpg_offset_to_ptr(recs->head_offset); ++ prev_recs = recs; ++ sys_recs = bch_nvmpg_offset_to_ptr(BCH_NVMPG_SYSRECS_OFFSET); ++ while (recs) { ++ for (i = 0; i < recs->size; i++) { ++ struct bch_nvmpg_rec *rec = &(recs->recs[i]); ++ 
++ if ((rec->pgoff == pgoff) && (rec->ns_id == ns_id)) { ++ WARN_ON(rec->order != order); ++ rec->_v = 0; ++ recs->used--; ++ ++ if (recs->used == 0) { ++ int recs_pos = recs - sys_recs; ++ ++ if (recs == prev_recs) ++ head->recs_offset[ns_id] = ++ recs->next_offset; ++ else ++ prev_recs->next_offset = ++ recs->next_offset; ++ ++ recs->next_offset = 0; ++ recs->head_offset = 0; ++ ++ bitmap_clear(ns->recs_bitmap, recs_pos, 1); ++ } ++ goto out; ++ } ++ } ++ prev_recs = recs; ++ recs = bch_nvmpg_offset_to_ptr(recs->next_offset); ++ } ++out: ++ return (recs ? 0 : -ENOENT); ++} ++ ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, ++ const char *uuid) ++{ ++ struct bch_nvmpg_ns *ns; ++ struct bch_nvmpg_head *head; ++ struct bch_nvmpg_recs *recs; ++ int r; ++ ++ mutex_lock(&global_nvmpg_set->lock); ++ ++ ns = global_nvmpg_set->ns_tbl[BCH_NVMPG_GET_NS_ID(nvmpg_offset)]; ++ if (!ns) { ++ pr_err("can't find namespace by given kaddr from namespace\n"); ++ goto unlock; ++ } ++ ++ head = find_nvmpg_head(uuid, false); ++ if (!head) { ++ pr_err("can't found bch_nvmpg_head by uuid\n"); ++ goto unlock; ++ } ++ ++ recs = find_nvmpg_recs(ns, head, false); ++ if (!recs) { ++ pr_err("can't find bch_nvmpg_recs by uuid\n"); ++ goto unlock; ++ } ++ ++ r = remove_nvmpg_rec(recs, ns->sb->this_ns, nvmpg_offset, order); ++ if (r < 0) { ++ pr_err("can't find bch_nvmpg_rec\n"); ++ goto unlock; ++ } ++ ++ __free_space(ns, nvmpg_offset, order); ++ ++unlock: ++ mutex_unlock(&global_nvmpg_set->lock); ++} ++EXPORT_SYMBOL_GPL(bch_nvmpg_free_pages); ++ + static int attach_nvmpg_set(struct bch_nvmpg_ns *ns) + { + struct bch_nvmpg_sb *sb = ns->sb; +@@ -677,6 +827,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size) + ns->pages_offset = sb->pages_offset; + ns->pages_total = sb->pages_total; + ns->sb = sb; ++ /* increase by __free_space() */ + ns->free = 0; + ns->bdev = bdev; + ns->set = global_nvmpg_set; +diff --git a/drivers/md/bcache/nvmpg.h 
b/drivers/md/bcache/nvmpg.h +index e52bb1f3f79e..009582b5771b 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -93,6 +93,7 @@ struct bch_nvmpg_ns *bch_register_namespace(const char *dev_path, size_t size); + int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); ++void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + + #else + +@@ -113,6 +114,8 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + return 0; + } + ++static inline void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid) { } ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0006-bcache-get-recs-list-head-for-allocated-pages-by-specific-uuid.patch b/for-test/nvdimm-support/meta-dev-20230303/0006-bcache-get-recs-list-head-for-allocated-pages-by-specific-uuid.patch new file mode 100644 index 0000000..fb312f3 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0006-bcache-get-recs-list-head-for-allocated-pages-by-specific-uuid.patch @@ -0,0 +1,67 @@ +From 5f3c1d461205e4baebfa952650827477d4d988cd Mon Sep 17 00:00:00 2001 +From: Jianpeng Ma <jianpeng.ma@intel.com> +Date: Thu, 21 Oct 2021 21:06:03 +0800 +Subject: [PATCH 06/16] bcache: get recs list head for allocated pages by + specific uuid + +This patch implements bch_get_nvmpg_head() of the buddy allocator +to be used to get recs list head for allocated pages by specific +uuid. Then the requester (owner) can find all previous allocated +nvdimm pages by iterating the recs list. 
+ +Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com> +Co-developed-by: Qiaowei Ren <qiaowei.ren@intel.com> +Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +--- + drivers/md/bcache/nvmpg.c | 6 ++++++ + drivers/md/bcache/nvmpg.h | 6 ++++++ + 2 files changed, 12 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 129938603bc7..87e2f5c3f734 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -539,6 +539,12 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + EXPORT_SYMBOL_GPL(bch_nvmpg_alloc_pages); + ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return find_nvmpg_head(uuid, false); ++} ++EXPORT_SYMBOL_GPL(bch_get_nvmpg_head); ++ + static inline void *nvm_end_addr(struct bch_nvmpg_ns *ns) + { + return ns->base_addr + (ns->pages_total << PAGE_SHIFT); +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 009582b5771b..a2621c201fa6 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else + +@@ -116,6 +117,11 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + + static inline void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid) { } + ++static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) ++{ ++ return NULL; ++} ++ + #endif /* CONFIG_BCACHE_NVM_PAGES */ + + #endif /* _BCACHE_NVM_PAGES_H */ +-- +2.39.2 + diff --git 
a/for-test/nvdimm-support/meta-dev-20230303/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-journal-buckets-in-bch_btree_gc_finish.patch b/for-test/nvdimm-support/meta-dev-20230303/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-journal-buckets-in-bch_btree_gc_finish.patch new file mode 100644 index 0000000..366f0a7 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0007-bcache-use-bucket-index-to-set-GC_MARK_METADATA-for-journal-buckets-in-bch_btree_gc_finish.patch @@ -0,0 +1,48 @@ +From 3e136dd5ecb2ce99a567f81504bf8e85501fab5b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:17:02 +0800 +Subject: [PATCH 07/16] bcache: use bucket index to set GC_MARK_METADATA for + journal buckets in bch_btree_gc_finish() + +Currently the meta data bucket locations on cache device are reserved +after the meta data stored on NVDIMM pages, for the meta data layout +consistency temporarily. So these buckets are still marked as meta data +by SET_GC_MARK() in bch_btree_gc_finish(). + +When BCH_FEATURE_INCOMPAT_NVDIMM_META is set, the sb.d[] stores linear +address of NVDIMM pages and not bucket index anymore. Therefore we +should avoid to find bucket index from sb.d[], and directly use bucket +index from ca->sb.first_bucket to (ca->sb.first_bucket + +ca->sb.njournal_buckets) for setting the gc mark of journal bucket. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/btree.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index e136d6edc1ed..6a90c33109c7 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -1761,8 +1761,10 @@ static void bch_btree_gc_finish(struct cache_set *c) + ca = c->cache; + ca->invalidate_needs_gc = 0; + +- for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) +- SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); ++ /* Range [first_bucket, first_bucket + keys) is for journal buckets */ ++ for (i = ca->sb.first_bucket; ++ i < ca->sb.first_bucket + ca->sb.njournal_buckets; i++) ++ SET_GC_MARK(ca->buckets + i, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0008-bcache-add-bch_nvmpg_flush-to-flush-LLC-of-NVDIMM-pages.patch b/for-test/nvdimm-support/meta-dev-20230303/0008-bcache-add-bch_nvmpg_flush-to-flush-LLC-of-NVDIMM-pages.patch new file mode 100644 index 0000000..03b1087 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0008-bcache-add-bch_nvmpg_flush-to-flush-LLC-of-NVDIMM-pages.patch @@ -0,0 +1,64 @@ +From 4fbfdf76868bffede3dde3a11fc28fae50a5f314 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 4 Jul 2022 15:14:13 +0800 +Subject: [PATCH 08/16] bcache: add bch_nvmpg_flush() to flush LLC of NVDIMM + pages + +bch_nvmpg_flush() is added to flush last level cache for all dirty +NVDIMM pages from the nvmpg allocator. It will be used in future patches +when last level cache flushing is necessary. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/nvmpg.c | 14 ++++++++++++++ + drivers/md/bcache/nvmpg.h | 2 ++ + 2 files changed, 16 insertions(+) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 87e2f5c3f734..aaa7f2ff70ab 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -299,6 +299,20 @@ static void __free_space(struct bch_nvmpg_ns *ns, unsigned long nvmpg_offset, + ns->free += add_pages; + } + ++void bch_nvmpg_flush(void) ++{ ++ int i; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_ns *ns = global_nvmpg_set->ns_tbl[i]; ++ ++ if (ns) ++ arch_invalidate_pmem(ns->base_addr, ++ ns->pages_total << PAGE_SHIFT); ++ } ++} ++ ++ + static void bch_nvmpg_init_free_space(struct bch_nvmpg_ns *ns) + { + unsigned int start, end, pages; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index a2621c201fa6..984c25cdf3d2 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -94,6 +94,7 @@ int bch_nvmpg_init(void); + void bch_nvmpg_exit(void); + unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); ++void bch_nvmpg_flush(void); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + + #else +@@ -116,6 +117,7 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + } + + static inline void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid) { } ++static inline void bch_nvmpg_flush(void) {} + + static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) + { +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0009-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-incompat-feature-set.patch b/for-test/nvdimm-support/meta-dev-20230303/0009-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-incompat-feature-set.patch new file mode 100644 index 0000000..e5aeaee --- 
/dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0009-bcache-add-BCH_FEATURE_INCOMPAT_NVDIMM_META-into-incompat-feature-set.patch @@ -0,0 +1,60 @@ +From 7a123f9624f90472d4fdd703838a7865ccd649ca Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Fri, 25 Jun 2021 00:18:31 +0800 +Subject: [PATCH 09/16] bcache: add BCH_FEATURE_INCOMPAT_NVDIMM_META into + incompat feature set + +This patch adds BCH_FEATURE_INCOMPAT_NVDIMM_META (value 0x0004) into the +incompat feature set. When this bit is set by bcache-tools, it indicates +bcache meta data should be stored on specific NVDIMM meta device. + +The bcache meta data mainly includes journal and btree nodes, when this +bit is set in incompat feature set, bcache will ask the nvm-pages +allocator for NVDIMM space to store the meta data. + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/features.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h +index 09161b89c63e..fab92678be76 100644 +--- a/drivers/md/bcache/features.h ++++ b/drivers/md/bcache/features.h +@@ -18,11 +18,19 @@ + #define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 + /* real bucket size is (1 << bucket_size) */ + #define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 ++/* store bcache meta data on nvdimm */ ++#define BCH_FEATURE_INCOMPAT_NVDIMM_META 0x0004 + + #define BCH_FEATURE_COMPAT_SUPP 0 + #define BCH_FEATURE_RO_COMPAT_SUPP 0 ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ ++ BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE| \ ++ BCH_FEATURE_INCOMPAT_NVDIMM_META) ++#else + #define BCH_FEATURE_INCOMPAT_SUPP 
(BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) ++#endif + + #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +@@ -90,6 +98,7 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ + + BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); + BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); ++BCH_FEATURE_INCOMPAT_FUNCS(nvdimm_meta, NVDIMM_META); + + static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) + { +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0010-bcache-initialize-bcache-journal-for-NVDIMM-meta-device.patch b/for-test/nvdimm-support/meta-dev-20230303/0010-bcache-initialize-bcache-journal-for-NVDIMM-meta-device.patch new file mode 100644 index 0000000..0ddf1e5 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0010-bcache-initialize-bcache-journal-for-NVDIMM-meta-device.patch @@ -0,0 +1,257 @@ +From 06b8897f74cbf6cff0565c9fd85f76985267f5ff Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 4 Jul 2022 15:17:51 +0800 +Subject: [PATCH 10/16] bcache: initialize bcache journal for NVDIMM meta + device + +The nvm-pages allocator may store and index the NVDIMM pages allocated +for bcache journal. This patch adds the initialization to store bcache +journal space on NVDIMM pages if BCH_FEATURE_INCOMPAT_NVDIMM_META bit is +set by bcache-tools. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set, get_nvdimm_journal_space() +will return the nvmpg_offset of NVDIMM pages for bcache journal, +- If there is previously allocated space, find it from nvm-pages owner + list and return to bch_journal_init(). +- If there is no previously allocated space, require a new NVDIMM range + from the nvm-pages allocator, and return it to bch_journal_init(). 
+ +And in bch_journal_init(), keys in sb.d[] store the corresponding nvmpg +offset from NVDIMM into sb.d[i].ptr[0] where 'i' is the bucket index to +iterate all journal buckets. + +Later when bcache journaling code stores the journaling jset, the target +NVDIMM nvmpg offset stored (and updated) in sb.d[i].ptr[0] can be used +to calculate the linear address in memory copy from DRAM pages into +NVDIMM pages. + +Signed-off-by: Coly Li <colyli@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 115 ++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/journal.h | 2 +- + drivers/md/bcache/nvmpg.c | 9 +++ + drivers/md/bcache/nvmpg.h | 1 + + drivers/md/bcache/super.c | 18 +++--- + 5 files changed, 134 insertions(+), 11 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index e5da469a4235..1040692c5cc7 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -9,6 +9,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "nvmpg.h" ++#include "features.h" + + #include <trace/events/bcache.h> + +@@ -997,3 +999,116 @@ int bch_journal_alloc(struct cache_set *c) + + return 0; + } ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static unsigned long find_journal_nvmpg_base(struct bch_nvmpg_head *nvmpg_head, ++ struct cache *ca) ++{ ++ unsigned long jnl_offset, jnl_pgoff, jnl_ns_id; ++ unsigned long ret_offset = 0; ++ int i; ++ ++ jnl_offset = (unsigned long)ca->sb.d[0]; ++ jnl_ns_id = BCH_NVMPG_GET_NS_ID(jnl_offset); ++ jnl_pgoff = BCH_NVMPG_GET_OFFSET(jnl_offset) >> PAGE_SHIFT; ++ ++ for (i = 0; i < BCH_NVMPG_NS_MAX; i++) { ++ struct bch_nvmpg_recs *recs; ++ struct bch_nvmpg_rec *rec; ++ unsigned long recs_offset = 0; ++ int j; ++ ++ recs_offset = 
nvmpg_head->recs_offset[i]; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ while (recs) { ++ for (j = 0; j < recs->size; j++) { ++ rec = &recs->recs[j]; ++ if ((rec->pgoff != jnl_pgoff) || ++ (rec->ns_id != jnl_ns_id)) ++ continue; ++ ++ ret_offset = jnl_offset; ++ goto out; ++ } ++ recs_offset = recs->next_offset; ++ recs = bch_nvmpg_offset_to_ptr(recs_offset); ++ } ++ } ++ ++out: ++ return ret_offset; ++} ++ ++static unsigned long get_journal_nvmpg_space(struct cache *ca) ++{ ++ struct bch_nvmpg_head *head = NULL; ++ unsigned long nvmpg_offset; ++ int order; ++ ++ head = bch_get_nvmpg_head(ca->sb.set_uuid); ++ if (head) { ++ nvmpg_offset = find_journal_nvmpg_base(head, ca); ++ if (nvmpg_offset) ++ goto found; ++ } ++ ++ order = ilog2((ca->sb.bucket_size * ++ ca->sb.njournal_buckets) / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, ca->sb.set_uuid); ++ if (nvmpg_offset) ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), ++ 0, (1 << order) * PAGE_SIZE); ++found: ++ return nvmpg_offset; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static int __bch_journal_nvdimm_init(struct cache *ca) ++{ ++ int ret = -1; ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ int i; ++ unsigned long jnl_base = 0; ++ ++ jnl_base = get_journal_nvmpg_space(ca); ++ if (!jnl_base) { ++ pr_err("Failed to get journal space from nvdimm\n"); ++ goto out; ++ } ++ ++ /* Iniialized and reloaded from on-disk super block already */ ++ if (ca->sb.d[0] != 0) { ++ ret = 0; ++ goto out; ++ } ++ ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = jnl_base + (bucket_bytes(ca) * i); ++ ++ ret = 0; ++out: ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++ return ret; ++} ++ ++ ++int bch_journal_init(struct cache_set *c) ++{ ++ int i, ret = 0; ++ struct cache *ca = c->cache; ++ ++ ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, ++ 2, SB_JOURNAL_BUCKETS); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) { ++ for (i = 0; i < ca->sb.keys; i++) ++ ca->sb.d[i] = ca->sb.first_bucket + i; ++ } else ++ 
ret = __bch_journal_nvdimm_init(ca); ++ ++ return ret; ++} +diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h +index cd316b4a1e95..3387659c5ede 100644 +--- a/drivers/md/bcache/journal.h ++++ b/drivers/md/bcache/journal.h +@@ -180,7 +180,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list); + void bch_journal_meta(struct cache_set *c, struct closure *cl); + int bch_journal_read(struct cache_set *c, struct list_head *list); + int bch_journal_replay(struct cache_set *c, struct list_head *list); +- ++int bch_journal_init(struct cache_set *c); + void bch_journal_free(struct cache_set *c); + int bch_journal_alloc(struct cache_set *c); + void bch_journal_space_reserve(struct journal *j); +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index aaa7f2ff70ab..642d2e21c565 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -24,6 +24,15 @@ + + struct bch_nvmpg_set *global_nvmpg_set; + ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ if ((ns_id >= 0) && (ns_id < BCH_NVMPG_NS_MAX)) ++ return global_nvmpg_set->ns_tbl[ns_id]; ++ ++ pr_emerg("Invalid ns_id: %d\n", ns_id); ++ return NULL; ++} ++ + void *bch_nvmpg_offset_to_ptr(unsigned long offset) + { + int ns_id; +diff --git a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index 984c25cdf3d2..a8a36a6caebb 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -96,6 +96,7 @@ unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid); + void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid); + void bch_nvmpg_flush(void); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); ++struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); + + #else + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index ffe79871aa69..eb04e8a4d6e7 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -146,9 +146,11 @@ static const char 
*read_super_common(struct cache_sb *sb, struct block_device * + goto err; + + err = "Journal buckets not sequential"; +- for (i = 0; i < sb->keys; i++) +- if (sb->d[i] != sb->first_bucket + i) +- goto err; ++ if (!bch_has_feature_nvdimm_meta(sb)) { ++ for (i = 0; i < sb->keys; i++) ++ if (sb->d[i] != sb->first_bucket + i) ++ goto err; ++ } + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) +@@ -2065,14 +2067,10 @@ static int run_cache_set(struct cache_set *c) + if (bch_journal_replay(c, &journal)) + goto err; + } else { +- unsigned int j; +- + pr_notice("invalidating existing data\n"); +- ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, +- 2, SB_JOURNAL_BUCKETS); +- +- for (j = 0; j < ca->sb.keys; j++) +- ca->sb.d[j] = ca->sb.first_bucket + j; ++ err = "error initializing journal"; ++ if (bch_journal_init(c)) ++ goto err; + + bch_initial_gc_finish(c); + +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0011-bcache-support-storing-bcache-journal-into-NVDIMM-meta-device.patch b/for-test/nvdimm-support/meta-dev-20230303/0011-bcache-support-storing-bcache-journal-into-NVDIMM-meta-device.patch new file mode 100644 index 0000000..610da85 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0011-bcache-support-storing-bcache-journal-into-NVDIMM-meta-device.patch @@ -0,0 +1,232 @@ +From 03b3d8c0e44a1dccecfdf13d2803070afd387516 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 7 Jun 2022 12:07:23 +0800 +Subject: [PATCH 11/16] bcache: support storing bcache journal into NVDIMM meta + device + +This patch implements two methods to store bcache journal to, +1) __journal_write_unlocked() for block interface device + The latency method to compose bio and issue the jset bio to cache + device (e.g. SSD). c->journal.key.ptr[0] indicates the LBA on cache + device to store the journal jset. 
+2) __journal_nvdimm_write_unlocked() for memory interface NVDIMM + Use memory interface to access NVDIMM pages and store the jset with + memcpy() and bch_nvmpg_flush(). c->journal.key.ptr[0] indicates the + linear address from the NVDIMM pages to store the journal jset. + +For legacy configuration without NVDIMM meta device, journal I/O is +handled by __journal_write_unlocked() with existing code logic. If the +NVDIMM meta device is used (by bcache-tools), the journal I/O will +be handled by __journal_nvdimm_write_unlocked() and go into the NVDIMM +pages. + +And when NVDIMM meta device is used, sb.d[] stores the linear addresses +from NVDIMM pages (no more bucket index), in journal_reclaim() the +journaling location in c->journal.key.ptr[0] should also be updated by +linear address from NVDIMM pages (no more LBA combined by sectors offset +and bucket index). + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 121 +++++++++++++++++++++++++----------- + drivers/md/bcache/super.c | 3 +- + 2 files changed, 86 insertions(+), 38 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 1040692c5cc7..9c325be17830 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -599,6 +599,8 @@ static void do_journal_discard(struct cache *ca) + return; + } + ++ BUG_ON(bch_has_feature_nvdimm_meta(&ca->sb)); ++ + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; +@@ -678,9 +680,16 @@ static void journal_reclaim(struct cache_set *c) + goto out; + + ja->cur_idx = (ja->cur_idx + 1) % ca->sb.njournal_buckets; +- k->ptr[0] = MAKE_PTR(0, +- bucket_to_sector(c, ca->sb.d[ja->cur_idx]), +- ca->sb.nr_this_dev); ++ ++ if 
(!bch_has_feature_nvdimm_meta(&ca->sb)) ++ k->ptr[0] = MAKE_PTR(0, ++ bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ++ ca->sb.nr_this_dev); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ k->ptr[0] = (unsigned long)bch_nvmpg_offset_to_ptr( ++ ca->sb.d[ja->cur_idx]); ++#endif + atomic_long_inc(&c->reclaimed_journal_buckets); + + bkey_init(k); +@@ -746,46 +755,21 @@ static void journal_write_unlock(struct closure *cl) + spin_unlock(&c->journal.lock); + } + +-static void journal_write_unlocked(struct closure *cl) ++ ++static void __journal_write_unlocked(struct cache_set *c) + __releases(c->journal.lock) + { +- struct cache_set *c = container_of(cl, struct cache_set, journal.io); +- struct cache *ca = c->cache; +- struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; +- unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * +- ca->sb.block_size; +- ++ struct journal_write *w = c->journal.cur; ++ struct closure *cl = &c->journal.io; ++ struct cache *ca = c->cache; + struct bio *bio; + struct bio_list list; ++ unsigned int i, sectors = set_blocks(w->data, block_bytes(ca)) * ++ ca->sb.block_size; + + bio_list_init(&list); + +- if (!w->need_write) { +- closure_return_with_destructor(cl, journal_write_unlock); +- return; +- } else if (journal_full(&c->journal)) { +- journal_reclaim(c); +- spin_unlock(&c->journal.lock); +- +- btree_flush_write(c); +- continue_at(cl, journal_write, bch_journal_wq); +- return; +- } +- +- c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); +- +- w->data->btree_level = c->root->level; +- +- bkey_copy(&w->data->btree_root, &c->root->key); +- bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); +- +- w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; +- w->data->magic = jset_magic(&ca->sb); +- w->data->version = BCACHE_JSET_VERSION; +- w->data->last_seq = last_seq(&c->journal); +- w->data->csum = csum_set(w->data); +- + for (i = 0; i < KEY_PTRS(k); i++) { + ca = c->cache; + bio = 
&ca->journal.bio; +@@ -808,7 +792,6 @@ static void journal_write_unlocked(struct closure *cl) + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } +- + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + +@@ -820,6 +803,72 @@ static void journal_write_unlocked(struct closure *cl) + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(c, bio, cl); ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static void __journal_nvdimm_write_unlocked(struct cache_set *c) ++ __releases(c->journal.lock) ++{ ++ struct journal_write *w = c->journal.cur; ++ struct cache *ca = c->cache; ++ unsigned int sectors; ++ ++ sectors = set_blocks(w->data, block_bytes(ca)) * ca->sb.block_size; ++ atomic_long_add(sectors, &ca->meta_sectors_written); ++ ++ memcpy((void *)c->journal.key.ptr[0], w->data, sectors << 9); ++ bch_nvmpg_flush(); ++ ++ c->journal.key.ptr[0] += sectors << 9; ++ ca->journal.seq[ca->journal.cur_idx] = w->data->seq; ++ ++ atomic_dec_bug(&fifo_back(&c->journal.pin)); ++ bch_journal_next(&c->journal); ++ journal_reclaim(c); ++ ++ spin_unlock(&c->journal.lock); ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++static void journal_write_unlocked(struct closure *cl) ++{ ++ struct cache_set *c = container_of(cl, struct cache_set, journal.io); ++ struct cache *ca = c->cache; ++ struct journal_write *w = c->journal.cur; ++ ++ if (!w->need_write) { ++ closure_return_with_destructor(cl, journal_write_unlock); ++ return; ++ } else if (journal_full(&c->journal)) { ++ journal_reclaim(c); ++ spin_unlock(&c->journal.lock); ++ ++ btree_flush_write(c); ++ continue_at(cl, journal_write, bch_journal_wq); ++ return; ++ } ++ ++ c->journal.blocks_free -= set_blocks(w->data, block_bytes(ca)); ++ ++ w->data->btree_level = c->root->level; ++ ++ bkey_copy(&w->data->btree_root, &c->root->key); ++ bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); ++ ++ w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; ++ w->data->magic = jset_magic(&ca->sb); ++ 
w->data->version = BCACHE_JSET_VERSION; ++ w->data->last_seq = last_seq(&c->journal); ++ w->data->csum = csum_set(w->data); ++ ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ __journal_write_unlocked(c); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ __journal_nvdimm_write_unlocked(c); ++#endif + + continue_at(cl, journal_write_done, NULL); + } +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index eb04e8a4d6e7..7581c3eaf34e 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -1676,7 +1676,7 @@ void bch_cache_set_release(struct kobject *kobj) + static void cache_set_free(struct closure *cl) + { + struct cache_set *c = container_of(cl, struct cache_set, cl); +- struct cache *ca; ++ struct cache *ca = c->cache; + + debugfs_remove(c->debug); + +@@ -1688,7 +1688,6 @@ static void cache_set_free(struct closure *cl) + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb))); + +- ca = c->cache; + if (ca) { + ca->set = NULL; + c->cache = NULL; +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0012-bcache-read-jset-from-NVDIMM-pages-for-journal-replay.patch b/for-test/nvdimm-support/meta-dev-20230303/0012-bcache-read-jset-from-NVDIMM-pages-for-journal-replay.patch new file mode 100644 index 0000000..9adc56a --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0012-bcache-read-jset-from-NVDIMM-pages-for-journal-replay.patch @@ -0,0 +1,177 @@ +From 97fbf32af7a3a3da1fa05a70df380f173b553725 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 7 Jun 2022 12:22:29 +0800 +Subject: [PATCH 12/16] bcache: read jset from NVDIMM pages for journal replay + +This patch implements two methods to read jset from media for journal +replay, +- __jnl_rd_bkt() for block device + This is the legacy method to read jset via block device interface. 
+- __jnl_rd_nvm_bkt() for NVDIMM + This is the method to read jset from NVDIMM memory interface, a.k.a. + memcpy() from NVDIMM pages to DRAM pages. + +If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in the incompat feature set, +while running the cache set, journal_read_bucket() will read the journal +content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of +NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which +were initialized and maintained in previous runs of the cache set. + +Note that when bch_journal_read() is called, the linear addresses of +NVDIMM pages are not loaded and initialized yet, so it is necessary +to call __bch_journal_nvdimm_init() before reading the jset +from NVDIMM pages. + +The code comment added in journal_read_bucket() responds to a report from kernel +test robot and Dan Carpenter; it explains why it is safe to only check the +!bch_has_feature_nvdimm_meta() condition in the if() statement when +CONFIG_BCACHE_NVM_PAGES is not configured, to avoid confusion from the +bogus warning message from the static checking tool. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +Reported-by: kernel test robot <lkp@intel.com> +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Hannes Reinecke <hare@suse.de> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/journal.c | 84 ++++++++++++++++++++++++++++++------- + 1 file changed, 69 insertions(+), 15 deletions(-) + +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 9c325be17830..24615df1f4e6 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -34,18 +34,58 @@ static void journal_read_endio(struct bio *bio) + closure_put(cl); + } + ++static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset, ++ struct closure *cl) ++{ ++ sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); ++ struct bio *bio = &ca->journal.bio; ++ struct jset *data = ca->set->journal.w[0].data; ++ ++ bio_reset(bio, ca->bdev, REQ_OP_READ); ++ bio->bi_iter.bi_sector = bucket + offset; ++ bio->bi_iter.bi_size = len << 9; ++ ++ bio->bi_end_io = journal_read_endio; ++ bio->bi_private = cl; ++ bch_bio_map(bio, data); ++ ++ closure_bio_submit(ca->set, bio, cl); ++ closure_sync(cl); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ ++static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, ++ unsigned int len, unsigned int offset) ++{ ++ void *jset_addr; ++ struct jset *data; ++ ++ jset_addr = bch_nvmpg_offset_to_ptr(ca->sb.d[bkt_idx]) + (offset << 9); ++ data = ca->set->journal.w[0].data; ++ ++ memcpy(data, jset_addr, len << 9); ++ ++ /* Indeed journal.w[0].data */ ++ return data; ++} ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ + static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned int 
bucket_index) + { + struct journal_device *ja = &ca->journal; +- struct bio *bio = &ja->bio; + + struct journal_replay *i; +- struct jset *j, *data = ca->set->journal.w[0].data; ++ struct jset *j; + struct closure cl; + unsigned int len, left, offset = 0; + int ret = 0; +- sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + +@@ -55,24 +95,27 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, + reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); + +- bio_reset(bio, ca->bdev, REQ_OP_READ); +- bio->bi_iter.bi_sector = bucket + offset; +- bio->bi_iter.bi_size = len << 9; +- +- bio->bi_end_io = journal_read_endio; +- bio->bi_private = &cl; +- bch_bio_map(bio, data); +- +- closure_bio_submit(ca->set, bio, &cl); +- closure_sync(&cl); ++ if (!bch_has_feature_nvdimm_meta(&ca->sb)) ++ j = __jnl_rd_bkt(ca, bucket_index, len, offset, &cl); ++ /* ++ * If CONFIG_BCACHE_NVM_PAGES is not defined, the feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META won't in incompatible ++ * support feature set, a cache device format with feature bit ++ * BCH_FEATURE_INCOMPAT_NVDIMM_META will fail much earlier in ++ * read_super() by bch_has_unknown_incompat_features(). ++ * Therefore when CONFIG_BCACHE_NVM_PAGES is not define, it is ++ * safe to ignore the bch_has_feature_nvdimm_meta() condition. ++ */ ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ else ++ j = __jnl_rd_nvm_bkt(ca, bucket_index, len, offset); ++#endif + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. 
+ */ +- +- j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); +@@ -168,6 +211,8 @@ reread: left = ca->sb.bucket_size - offset; + return ret; + } + ++static int __bch_journal_nvdimm_init(struct cache *ca); ++ + int bch_journal_read(struct cache_set *c, struct list_head *list) + { + #define read_bucket(b) \ +@@ -186,6 +231,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) + unsigned int i, l, r, m; + uint64_t seq; + ++ /* ++ * Linear addresses of NVDIMM pages for journaling is not ++ * initialized yet, do it before read jset from NVDIMM pages. ++ */ ++ if (bch_has_feature_nvdimm_meta(&ca->sb)) { ++ if (__bch_journal_nvdimm_init(ca) < 0) ++ return -ENXIO; ++ } ++ + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); + +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0013-bcache-add-sysfs-interface-register_nvdimm_meta-to-register-NVDIMM-meta-device.patch b/for-test/nvdimm-support/meta-dev-20230303/0013-bcache-add-sysfs-interface-register_nvdimm_meta-to-register-NVDIMM-meta-device.patch new file mode 100644 index 0000000..3c23ad5 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0013-bcache-add-sysfs-interface-register_nvdimm_meta-to-register-NVDIMM-meta-device.patch @@ -0,0 +1,84 @@ +From f5e95ef2ef8d076e95a673ce4249503ea46a5d42 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Sat, 24 Jul 2021 00:55:25 +0800 +Subject: [PATCH 13/16] bcache: add sysfs interface register_nvdimm_meta to + register NVDIMM meta device + +This patch adds a sysfs interface register_nvdimm_meta to register +NVDIMM meta device. The sysfs interface file only shows up when +CONFIG_BCACHE_NVM_PAGES=y. 
Then a NVDIMM name space formatted by +bcache-tools can be registered into bcache by e.g., + echo /dev/pmem0 > /sys/fs/bcache/register_nvdimm_meta + +Signed-off-by: Coly Li <colyli@suse.de> +Reviewed-by: Hannes Reinecke <hare@suse.de> +Cc: Christoph Hellwig <hch@lst.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Jens Axboe <axboe@kernel.dk> +Cc: Jianpeng Ma <jianpeng.ma@intel.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +--- + drivers/md/bcache/super.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 7581c3eaf34e..0043b0675df8 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2403,10 +2403,18 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, ++ struct kobj_attribute *attr, ++ const char *buffer, size_t size); ++#endif + + kobj_attribute_write(register, register_bcache); + kobj_attribute_write(register_quiet, register_bcache); + kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++kobj_attribute_write(register_nvdimm_meta, register_nvdimm_meta); ++#endif + + static bool bch_is_open_backing(dev_t dev) + { +@@ -2520,6 +2528,24 @@ static void register_device_async(struct async_reg_args *args) + queue_delayed_work(system_wq, &args->reg_work, 10); + } + ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++static ssize_t register_nvdimm_meta(struct kobject *k, struct kobj_attribute *attr, ++ const char *buffer, size_t size) ++{ ++ ssize_t ret = size; ++ ++ struct bch_nvmpg_ns *ns = bch_register_namespace(buffer, size); ++ ++ if (IS_ERR(ns)) { ++ pr_err("register nvdimm namespace %s for meta device failed.\n", ++ buffer); ++ ret = -EINVAL; ++ } ++ 
++ return ret; ++} ++#endif ++ + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) + { +@@ -2862,6 +2888,9 @@ static int __init bcache_init(void) + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, ++#if defined(CONFIG_BCACHE_NVM_PAGES) ++ &ksysfs_register_nvdimm_meta.attr, ++#endif + &ksysfs_pendings_cleanup.attr, + NULL + }; +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0014-bcache-add-helper-routines-to-convert-bkey-and-nvmpg-offset.patch b/for-test/nvdimm-support/meta-dev-20230303/0014-bcache-add-helper-routines-to-convert-bkey-and-nvmpg-offset.patch new file mode 100644 index 0000000..5529725 --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0014-bcache-add-helper-routines-to-convert-bkey-and-nvmpg-offset.patch @@ -0,0 +1,169 @@ +From 0b109e7d981a92628a89fe01706fef89196abddd Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 4 Jul 2022 21:04:09 +0800 +Subject: [PATCH 14/16] bcache: add helper routines to convert bkey and nvmpg + offset + +This patch adds the following routines to convert bkey format offset and +nvmpg format offset, and convert a bkey format offset to linear address +on nvmpg pages, +- bug_on_bkey_offset_limit() + If the bkey format offset is too large (should not happen), call BUG() + to avoid further chaos. +- bkey_offset_to_nvmpg_ns_id() + Extract the nvmpg namespace id from bkey format offset value. +- bkey_offset_to_nvmpg_offset() + Convert bkey format offset to nvmpg format offset. +- nvmpg_offset_to_bkey_offset() + Convert nvmpg format offset to bkey format offset. +- bkey_offset_to_nvmpg_ptr() + Convert bkey format offset to a linear address on nvmpg pages. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/nvmpg.c | 48 +++++++++++++++++++++++++++++++++++++ + drivers/md/bcache/nvmpg.h | 50 +++++++++++++++++++++++++++++++++++++-- + 2 files changed, 96 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/bcache/nvmpg.c b/drivers/md/bcache/nvmpg.c +index 642d2e21c565..da1045c0e10c 100644 +--- a/drivers/md/bcache/nvmpg.c ++++ b/drivers/md/bcache/nvmpg.c +@@ -91,6 +91,54 @@ static void *bch_nvmpg_rec_to_ptr(struct bch_nvmpg_rec *r) + return bch_nvmpg_pgoff_to_ptr(ns, pgoff); + } + ++static void bug_on_bkey_offset_limit(unsigned long sector) ++{ ++ if (sector >= ((1UL << BCH_BKEY_OFFSET_BITS) - 1)) { ++ pr_err("Invalid NVDIMM offset: too large as 0x%lx\n", sector); ++ pr_err("Such condition should never happen. Panic.\n"); ++ BUG(); ++ } ++} ++ ++int bkey_offset_to_nvmpg_ns_id(unsigned long bkey_offset) ++{ ++ return (bkey_offset >> BCH_BKEY_OFFSET_BITS) & ++ BCH_BKEY_OFFSET_NS_ID_MASK; ++} ++ ++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset) ++{ ++ int ns_id; ++ unsigned long offset; ++ ++ ns_id = (bkey_offset >> BCH_BKEY_OFFSET_BITS) & ++ BCH_BKEY_OFFSET_NS_ID_MASK; ++ ++ offset = (bkey_offset & BCH_BKEY_OFFSET_MASK) << 9; ++ return BCH_NVMPG_OFFSET(ns_id, offset); ++} ++ ++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset) ++{ ++ int ns_id; ++ unsigned long sector; ++ ++ ns_id = BCH_NVMPG_GET_NS_ID(nvmpg_offset); ++ sector = BCH_NVMPG_GET_OFFSET(nvmpg_offset) >> 9; ++ bug_on_bkey_offset_limit(sector); ++ ++ return ((sector & BCH_BKEY_OFFSET_MASK) | ++ ((ns_id & BCH_BKEY_OFFSET_NS_ID_MASK) << BCH_BKEY_OFFSET_BITS)); ++} ++ ++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset) ++{ ++ unsigned long nvmpg_offset; ++ ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(bkey_offset); ++ return bch_nvmpg_offset_to_ptr(nvmpg_offset); ++} ++ + static inline void reserve_nvmpg_pages(struct bch_nvmpg_ns *ns, + pgoff_t pgoff, u64 nr) + { +diff --git 
a/drivers/md/bcache/nvmpg.h b/drivers/md/bcache/nvmpg.h +index a8a36a6caebb..c187185d0c3f 100644 +--- a/drivers/md/bcache/nvmpg.h ++++ b/drivers/md/bcache/nvmpg.h +@@ -84,8 +84,16 @@ struct bch_nvmpg_set { + (BCH_NVMPG_START - BCH_NVMPG_SYSRECS_OFFSET) / \ + sizeof(struct bch_nvmpg_recs))) + +-void *bch_nvmpg_offset_to_ptr(unsigned long offset); +-unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++/* For bkey PTR_OFFSET to nvmpg namespace ID and offset conversion. ++ * ++ * PTR_OFFSET is 43 bits, the most significant 3 bits are for ++ * namespace ID. The remaining 40 bits are for per-namespace offset ++ * in sectors. ++ */ ++#define BCH_BKEY_OFFSET_NS_ID_BITS 3 ++#define BCH_BKEY_OFFSET_NS_ID_MASK ((1UL<<BCH_BKEY_OFFSET_NS_ID_BITS) - 1) ++#define BCH_BKEY_OFFSET_BITS 40 ++#define BCH_BKEY_OFFSET_MASK ((1UL<<BCH_BKEY_OFFSET_BITS) - 1) + + #if defined(CONFIG_BCACHE_NVM_PAGES) + +@@ -97,6 +105,12 @@ void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uui + void bch_nvmpg_flush(void); + struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid); + struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id); ++unsigned long bkey_offset_to_nvmpg_offset(unsigned long bkey_offset); ++unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset); ++void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset); ++void *bch_nvmpg_offset_to_ptr(unsigned long offset); ++unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr); ++ + + #else + +@@ -120,11 +134,43 @@ static inline unsigned long bch_nvmpg_alloc_pages(int order, const char *uuid) + static inline void bch_nvmpg_free_pages(unsigned long nvmpg_offset, int order, const char *uuid) { } + static inline void bch_nvmpg_flush(void) {} + ++static inline struct bch_nvmpg_ns *bch_nvmpg_id_to_ns(int ns_id) ++{ ++ return NULL; ++} ++ + static inline struct bch_nvmpg_head *bch_get_nvmpg_head(const char *uuid) + { + return NULL; + } + ++static inline unsigned long 
bkey_offset_to_nvmpg_offset(unsigned long bkey_offset) ++{ ++ return 0; ++} ++ ++static inline unsigned long nvmpg_offset_to_bkey_offset(unsigned long nvmpg_offset) ++{ ++ return 0; ++} ++ ++/* XXX: should not return NULL when NVDIMM support is not enabled */ ++static inline void *bkey_offset_to_nvmpg_ptr(unsigned long bkey_offset) ++{ ++ return NULL; ++} ++ ++static inline void *bch_nvmpg_offset_to_ptr(unsigned long offset) ++{ ++ return NULL; ++} ++ ++static inline unsigned long bch_nvmpg_ptr_to_offset(struct bch_nvmpg_ns *ns, void *ptr) ++{ ++ return 0; ++} ++ ++ ++#endif /* CONFIG_BCACHE_NVM_PAGES */ ++ ++#endif /* _BCACHE_NVM_PAGES_H */ +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0015-bcache-add-KEY_NVMPG-bit-in-KEY_FIELD.patch b/for-test/nvdimm-support/meta-dev-20230303/0015-bcache-add-KEY_NVMPG-bit-in-KEY_FIELD.patch new file mode 100644 index 0000000..c94600f --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0015-bcache-add-KEY_NVMPG-bit-in-KEY_FIELD.patch @@ -0,0 +1,30 @@ +From 8ddd7e832cb0f838cbab5f153030f48190f8ec72 Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Mon, 4 Jul 2022 21:21:02 +0800 +Subject: [PATCH 15/16] bcache: add KEY_NVMPG bit in KEY_FIELD + +This patch adds KEY_NVMPG bit in KEY_FIELD, when KEY_NVMPG bit is set, +PTR_OFFSET of the key points to a NVDIMM area. It will be used in later +patches to check whether a btree node is allocated from NVDIMM pages. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/bcache_ondisk.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h +index 97413586195b..6c890f632197 100644 +--- a/drivers/md/bcache/bcache_ondisk.h ++++ b/drivers/md/bcache/bcache_ondisk.h +@@ -45,7 +45,7 @@ static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ + KEY_FIELD(KEY_PTRS, high, 60, 3) + KEY_FIELD(__PAD0, high, 58, 2) + KEY_FIELD(KEY_CSUM, high, 56, 2) +-KEY_FIELD(__PAD1, high, 55, 1) ++KEY_FIELD(KEY_NVMPG, high, 55, 1) + KEY_FIELD(KEY_DIRTY, high, 36, 1) + + KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +-- +2.39.2 + diff --git a/for-test/nvdimm-support/meta-dev-20230303/0016-bcache-support-storing-bcache-btree-nodes-into-NVDIMM-meta-device.patch b/for-test/nvdimm-support/meta-dev-20230303/0016-bcache-support-storing-bcache-btree-nodes-into-NVDIMM-meta-device.patch new file mode 100644 index 0000000..81d1def --- /dev/null +++ b/for-test/nvdimm-support/meta-dev-20230303/0016-bcache-support-storing-bcache-btree-nodes-into-NVDIMM-meta-device.patch @@ -0,0 +1,560 @@ +From b3a2634cd2ac86de5e7ac607104db6866b1b9f6b Mon Sep 17 00:00:00 2001 +From: Coly Li <colyli@suse.de> +Date: Tue, 5 Jul 2022 23:24:18 +0800 +Subject: [PATCH 16/16] bcache: support storing bcache btree nodes into NVDIMM + meta device + +WIP. 
+ +Signed-off-by: Coly Li <colyli@suse.de> +--- + drivers/md/bcache/alloc.c | 45 ++++++++++ + drivers/md/bcache/bcache.h | 1 + + drivers/md/bcache/btree.c | 163 +++++++++++++++++++++++++++++++----- + drivers/md/bcache/extents.c | 55 ++++++++---- + drivers/md/bcache/journal.c | 3 + + drivers/md/bcache/request.c | 13 ++- + 6 files changed, 241 insertions(+), 39 deletions(-) + +diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c +index ce13c272c387..54ccf83b7261 100644 +--- a/drivers/md/bcache/alloc.c ++++ b/drivers/md/bcache/alloc.c +@@ -63,6 +63,7 @@ + + #include "bcache.h" + #include "btree.h" ++#include "nvmpg.h" + + #include <linux/blkdev.h> + #include <linux/kthread.h> +@@ -477,14 +478,58 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) + } + } + ++static void __bch_nvmpg_bucket_free(struct cache_set *c, struct bkey *k) ++{ ++ int order; ++ unsigned long nvmpg_offset; ++ ++ order = ilog2(c->cache->sb.bucket_size / PAGE_SECTORS); ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(k, 0)); ++ memset(bch_nvmpg_offset_to_ptr(nvmpg_offset), 0, 1<<order); ++ bch_nvmpg_free_pages(nvmpg_offset, order, c->set_uuid); ++} ++ + void bch_bucket_free(struct cache_set *c, struct bkey *k) + { + unsigned int i; + ++ if (KEY_NVMPG(k)) { ++ __bch_nvmpg_bucket_free(c, k); ++ return; ++ } ++ + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(c->cache, PTR_BUCKET(c, k, i)); + } + ++int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k) ++{ ++ struct cache *ca; ++ unsigned long nvmpg_offset, bkey_offset; ++ int order; ++ ++ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) ++ return -1; ++ ++ lockdep_assert_held(&c->bucket_lock); ++ ++ ca = c->cache; ++ order = ilog2(ca->sb.bucket_size / PAGE_SECTORS); ++ nvmpg_offset = bch_nvmpg_alloc_pages(order, c->set_uuid); ++ if (!nvmpg_offset) ++ goto err; ++ ++ bkey_offset = nvmpg_offset_to_bkey_offset(nvmpg_offset); ++ bkey_init(k); ++ SET_KEY_NVMPG(k, true); ++ k->ptr[0] = MAKE_PTR(0, 
bkey_offset, ca->sb.nr_this_dev); ++ SET_KEY_PTRS(k, 1); ++ ++ return 0; ++err: ++ return -1; ++} ++ + int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, bool wait) + { +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 2acda9cea0f9..395c923d68cf 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -991,6 +991,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, + unsigned int sectors, unsigned int write_point, + unsigned int write_prio, bool wait); + bool bch_cached_dev_error(struct cached_dev *dc); ++int __bch_nvmpg_bucket_alloc(struct cache_set *c, struct bkey *k); + + __printf(2, 3) + bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); +diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c +index 6a90c33109c7..022a227f0c02 100644 +--- a/drivers/md/bcache/btree.c ++++ b/drivers/md/bcache/btree.c +@@ -25,6 +25,8 @@ + #include "btree.h" + #include "debug.h" + #include "extents.h" ++#include "features.h" ++#include "nvmpg.h" + + #include <linux/slab.h> + #include <linux/bitops.h> +@@ -129,6 +131,9 @@ void bkey_put(struct cache_set *c, struct bkey *k) + { + unsigned int i; + ++ if (KEY_NVMPG(k)) ++ return; ++ + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +@@ -170,6 +175,10 @@ void bch_btree_node_read_done(struct btree *b) + for (; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; + i = write_block(b)) { ++ err = "bad magic"; ++ if (i->magic != bset_magic(&b->c->cache->sb)) ++ goto err; ++ + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; +@@ -179,10 +188,6 @@ void bch_btree_node_read_done(struct btree *b) + btree_blocks(b)) + goto err; + +- err = "bad magic"; +- if (i->magic != bset_magic(&b->c->cache->sb)) +- goto err; +- + err = "bad checksum"; + switch (i->version) { + case 0: +@@ -227,9 +232,15 @@ void 
bch_btree_node_read_done(struct btree *b) + return; + err: + set_btree_node_io_error(b); +- bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", +- err, PTR_BUCKET_NR(b->c, &b->key, 0), +- bset_block_offset(b, i), i->keys); ++ if (!KEY_NVMPG(&b->key)) ++ bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", ++ err, PTR_BUCKET_NR(b->c, &b->key, 0), ++ bset_block_offset(b, i), i->keys); ++ else ++ bch_cache_set_error(b->c, "%s at addr %p, block %u, %u keys", ++ err, bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0)), ++ bset_block_offset(b, i), i->keys); ++ + goto out; + } + +@@ -240,7 +251,7 @@ static void btree_node_read_endio(struct bio *bio) + closure_put(cl); + } + +-static void bch_btree_node_read(struct btree *b) ++static void __bch_btree_node_read(struct btree *b) + { + uint64_t start_time = local_clock(); + struct closure cl; +@@ -278,6 +289,28 @@ static void bch_btree_node_read(struct btree *b) + PTR_BUCKET_NR(b->c, &b->key, 0)); + } + ++static void __bch_nvmpg_btree_node_read(struct btree *b) ++{ ++ uint64_t start_time = local_clock(); ++ void *ptr; ++ ++ ptr = bkey_offset_to_nvmpg_ptr(PTR_OFFSET(&b->key, 0)); ++ memcpy(b->keys.set[0].data, ptr, KEY_SIZE(&b->key) << 9); ++ ++ bch_btree_node_read_done(b); ++ bch_time_stats_update(&b->c->btree_read_time, start_time); ++} ++ ++static void bch_btree_node_read(struct btree *b) ++{ ++ trace_bcache_btree_read(b); ++ ++ if (!KEY_NVMPG(&b->key)) ++ __bch_btree_node_read(b); ++ else ++ __bch_nvmpg_btree_node_read(b); ++} ++ + static void btree_complete_write(struct btree *b, struct btree_write *w) + { + if (w->prio_blocked && +@@ -335,7 +368,7 @@ static void btree_node_write_endio(struct bio *bio) + closure_put(cl); + } + +-static void do_btree_node_write(struct btree *b) ++static void __do_btree_node_write(struct btree *b) + { + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); +@@ -400,6 +433,68 @@ static void do_btree_node_write(struct btree *b) + } + } + ++static void 
btree_nvmpg_complete_write(struct btree *b, struct btree_write *w) ++{ ++ atomic_sub(w->prio_blocked, &b->c->prio_blocked); ++ ++ if (w->journal) { ++ atomic_dec_bug(w->journal); ++ __closure_wake_up(&b->c->journal.wait); ++ } ++ ++ w->prio_blocked = 0; ++ w->journal = NULL; ++} ++ ++static void btree_nvmpg_node_write_done(struct closure *cl) ++{ ++ struct btree *b = container_of(cl, struct btree, io); ++ struct btree_write *w = btree_prev_write(b); ++ ++ btree_nvmpg_complete_write(b, w); ++ ++ if (btree_node_dirty(b)) ++ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); ++ ++ closure_return_with_destructor(cl, btree_node_write_unlock); ++} ++ ++static void __do_nvmpg_btree_node_write(struct btree *b) ++{ ++ struct closure *cl = &b->io; ++ struct bset *i = btree_bset_last(b); ++ unsigned long nvmpg_offset; ++ void *nvmpg_ptr; ++ ++ i->version = BCACHE_BSET_VERSION; ++ i->csum = btree_csum_set(b, i); ++ ++ BUG_ON(b->bio); ++ ++ /* Calculate location to write */ ++ nvmpg_offset = bkey_offset_to_nvmpg_offset(PTR_OFFSET(&b->key, 0)); ++ nvmpg_ptr = bch_nvmpg_offset_to_ptr(nvmpg_offset) + ++ bset_byte_offset(&b->keys, i); ++ ++ if (b->level > 0) ++ memcpy_flushcache(nvmpg_ptr, i, ++ roundup(set_bytes(i), block_bytes(b->c->cache))); ++ else ++ memcpy(nvmpg_ptr, i, ++ roundup(set_bytes(i), block_bytes(b->c->cache))); ++ ++ closure_sync(cl); ++ continue_at_nobarrier(cl, btree_nvmpg_node_write_done, NULL); ++} ++ ++static void do_btree_node_write(struct btree *b) ++{ ++ if (!KEY_NVMPG(&b->key)) ++ __do_btree_node_write(b); ++ else ++ __do_nvmpg_btree_node_write(b); ++} ++ + void __bch_btree_node_write(struct btree *b, struct closure *parent) + { + struct bset *i = btree_bset_last(b); +@@ -535,6 +630,9 @@ static void mca_bucket_free(struct btree *b) + { + BUG_ON(btree_node_dirty(b)); + ++ if (KEY_NVMPG(&b->key)) ++ SET_KEY_NVMPG(&b->key, false); ++ + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +@@ -1091,13 +1189,25 
@@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + { + BKEY_PADDED(key) k; + struct btree *b = ERR_PTR(-EAGAIN); ++ int err = -1; + + mutex_lock(&c->bucket_lock); + retry: +- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) +- goto err; ++ /* ++ * If nvdimm_meta feature is enabled, try to allocate btree ++ * node from NVDIMM pages and set KEY_NVMPG bit successfully. ++ */ ++ if (bch_has_feature_nvdimm_meta(&(c->cache->sb))) ++ err = __bch_nvmpg_bucket_alloc(c, &k.key); ++ ++ if (err < 0) { ++ err = __bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait); ++ if (!err) ++ bkey_put(c, &k.key); ++ else ++ goto err; ++ } + +- bkey_put(c, &k.key); + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, op, &k.key, level); +@@ -1159,10 +1269,12 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k) + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + +- for (i = 0; i < KEY_PTRS(k); i++) +- SET_PTR_GEN(k, i, +- bch_inc_gen(b->c->cache, +- PTR_BUCKET(b->c, &b->key, i))); ++ if (!KEY_NVMPG(&b->key)) { ++ for (i = 0; i < KEY_PTRS(k); i++) ++ SET_PTR_GEN(k, i, ++ bch_inc_gen(b->c->cache, ++ PTR_BUCKET(b->c, &b->key, i))); ++ } + + mutex_unlock(&b->c->bucket_lock); + } +@@ -1205,6 +1317,9 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + if (!bkey_cmp(k, &ZERO_KEY)) + return stale; + ++ if (KEY_NVMPG(k)) ++ return stale; ++ + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(c, k, i)) + continue; +@@ -1248,6 +1363,9 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) + { + unsigned int i; + ++ if (KEY_NVMPG(k)) ++ return; ++ + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { +@@ -1748,10 +1866,14 @@ static void bch_btree_gc_finish(struct cache_set *c) + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, +- &dc->writeback_keys.keys, node) ++ &dc->writeback_keys.keys, 
node) { ++ if (KEY_NVMPG(&w->key)) ++ continue; ++ + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); ++ } + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); +@@ -2480,8 +2602,11 @@ void bch_btree_set_root(struct btree *b) + + BUG_ON(!b->written); + +- for (i = 0; i < KEY_PTRS(&b->key); i++) +- BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); ++ if (!KEY_NVMPG(&b->key)) { ++ for (i = 0; i < KEY_PTRS(&b->key); i++) ++ BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != ++ BTREE_PRIO); ++ } + + mutex_lock(&b->c->bucket_lock); + list_del_init(&b->list); +diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c +index d626ffcbecb9..4b11d857f091 100644 +--- a/drivers/md/bcache/extents.c ++++ b/drivers/md/bcache/extents.c +@@ -51,13 +51,18 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = c->cache; +- size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + +- if (KEY_SIZE(k) + r > c->cache->sb.bucket_size || +- bucket < ca->sb.first_bucket || +- bucket >= ca->sb.nbuckets) ++ if (KEY_SIZE(k) + r > c->cache->sb.bucket_size) + return true; ++ ++ if (!KEY_NVMPG(k)) { ++ size_t bucket = PTR_BUCKET_NR(c, k, i); ++ ++ if (bucket < ca->sb.first_bucket || ++ bucket >= ca->sb.nbuckets) ++ return true; ++ } + } + + return false; +@@ -72,17 +77,20 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = c->cache; +- size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->cache->sb.bucket_size) + return "bad, length too big"; +- if (bucket < ca->sb.first_bucket) +- return "bad, short offset"; +- if (bucket >= ca->sb.nbuckets) +- return "bad, offset past end of device"; +- if 
(ptr_stale(c, k, i)) +- return "stale"; ++ if (!KEY_NVMPG(k)) { ++ size_t bucket = PTR_BUCKET_NR(c, k, i); ++ ++ if (bucket < ca->sb.first_bucket) ++ return "bad, short offset"; ++ if (bucket >= ca->sb.nbuckets) ++ return "bad, offset past end of device"; ++ if (ptr_stale(c, k, i)) ++ return "stale"; ++ } + } + + if (!bkey_cmp(k, &ZERO_KEY)) +@@ -129,6 +137,9 @@ static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) + unsigned int j; + char buf[80]; + ++ if (KEY_NVMPG(k)) ++ return; ++ + bch_extent_to_text(buf, sizeof(buf), k); + pr_cont(" %s", buf); + +@@ -176,6 +187,9 @@ static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) + char buf[80]; + struct bucket *g; + ++ if (KEY_NVMPG(k)) ++ return false; ++ + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { +@@ -212,10 +226,12 @@ static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) + bch_ptr_invalid(bk, k)) + return true; + +- for (i = 0; i < KEY_PTRS(k); i++) +- if (!ptr_available(b->c, k, i) || +- ptr_stale(b->c, k, i)) ++ for (i = 0; i < KEY_PTRS(k); i++) { ++ if (!ptr_available(b->c, k, i)) ++ return true; ++ if (!KEY_NVMPG(k) && ptr_stale(b->c, k, i)) + return true; ++ } + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) +@@ -507,9 +523,13 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) + static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned int ptr) + { +- struct bucket *g = PTR_BUCKET(b->c, k, ptr); ++ struct bucket *g; + char buf[80]; + ++ if (KEY_NVMPG(k)) ++ return false; ++ ++ g = PTR_BUCKET(b->c, k, ptr); + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + (!GC_MARK(g) || +@@ -548,7 +568,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) + if (!ptr_available(b->c, k, i)) + return true; + +- for (i = 0; i < KEY_PTRS(k); i++) { ++ for (i = 0; 
(!KEY_NVMPG(k)) && (i < KEY_PTRS(k)); i++) { + stale = ptr_stale(b->c, k, i); + + if (stale && KEY_DIRTY(k)) { +@@ -588,6 +608,9 @@ static bool bch_extent_merge(struct btree_keys *bk, + if (key_merging_disabled(b->c)) + return false; + ++ if (KEY_NVMPG(l) || KEY_NVMPG(r)) ++ return false; ++ + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) +diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c +index 24615df1f4e6..85a20e081f12 100644 +--- a/drivers/md/bcache/journal.c ++++ b/drivers/md/bcache/journal.c +@@ -382,6 +382,9 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) + if (!__bch_extent_invalid(c, k)) { + unsigned int j; + ++ if (KEY_NVMPG(k)) ++ continue; ++ + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); +diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c +index f2c5a7e06fa9..4a5d75e8a2dd 100644 +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -232,9 +232,11 @@ static void bch_data_insert_start(struct closure *cl) + if (op->writeback) { + SET_KEY_DIRTY(k, true); + +- for (i = 0; i < KEY_PTRS(k); i++) +- SET_GC_MARK(PTR_BUCKET(op->c, k, i), +- GC_MARK_DIRTY); ++ if (!KEY_NVMPG(k)) { ++ for (i = 0; i < KEY_PTRS(k); i++) ++ SET_GC_MARK(PTR_BUCKET(op->c, k, i), ++ GC_MARK_DIRTY); ++ } + } + + SET_KEY_CSUM(k, op->csum); +@@ -542,7 +544,10 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + +- PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; ++ if (!KEY_NVMPG(k)) ++ PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; ++ else ++ pr_err("nvmpg key should not show up here.\n"); + + if (KEY_DIRTY(k)) + s->read_dirty_data = true; +-- +2.39.2 + |