author     Andrew Morton <akpm@linux-foundation.org>   2024-04-18 13:42:04 -0700
committer  Andrew Morton <akpm@linux-foundation.org>   2024-04-18 13:42:04 -0700
commit     578a2c2d8e5c25cc32ea3ab3515c903b7c45ba04 (patch)
tree       44ef1c7e567c0114204658f181b4dd971daede14
parent     c65c0e14247748216c988a1b18897d1258afaaf7 (diff)
download   25-new-578a2c2d8e5c25cc32ea3ab3515c903b7c45ba04.tar.gz
foo
40 files changed, 1548 insertions, 16 deletions
diff --git a/patches/crash-add-prefix-for-crash-dumping-messages.patch b/patches/crash-add-prefix-for-crash-dumping-messages.patch new file mode 100644 index 000000000..8e0449aa0 --- /dev/null +++ b/patches/crash-add-prefix-for-crash-dumping-messages.patch @@ -0,0 +1,56 @@ +From: Baoquan He <bhe@redhat.com> +Subject: crash: add prefix for crash dumping messages +Date: Thu, 18 Apr 2024 11:58:43 +0800 + +Add pr_fmt() to kernel/crash_core.c so that the module name is printed +as a prefix on its debugging messages. + +Also add the 'crashkernel:' prefix to two message-printing lines in +kernel/crash_reserve.c. In that file, almost all debugging messages +already have the 'crashkernel:' prefix, or contain the keyword +crashkernel at the beginning or in the middle, so adding pr_fmt() there +would be redundant. + +Link: https://lkml.kernel.org/r/20240418035843.1562887-1-bhe@redhat.com +Signed-off-by: Baoquan He <bhe@redhat.com> +Cc: Dave Young <dyoung@redhat.com> +Cc: Jiri Slaby <jirislaby@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + kernel/crash_core.c | 2 ++ + kernel/crash_reserve.c | 4 ++-- + 2 files changed, 4 insertions(+), 2 deletions(-) + +--- a/kernel/crash_core.c~crash-add-prefix-for-crash-dumping-messages ++++ a/kernel/crash_core.c +@@ -4,6 +4,8 @@ + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + */ + ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ + #include <linux/buildid.h> + #include <linux/init.h> + #include <linux/utsname.h> +--- a/kernel/crash_reserve.c~crash-add-prefix-for-crash-dumping-messages ++++ a/kernel/crash_reserve.c +@@ -109,7 +109,7 @@ static int __init parse_crashkernel_mem( + + size = memparse(cur, &tmp); + if (cur == tmp) { +- pr_warn("Memory value expected\n"); ++ pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; +@@ -132,7 +132,7 @@ static int __init parse_crashkernel_mem( + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { +- pr_warn("Memory value expected after '@'\n"); ++ pr_warn("crashkernel: Memory value expected after '@'\n"); + return -EINVAL; + } + } +_ diff --git a/patches/hugetlb-check-for-anon_vma-prior-to-folio-allocation.patch b/patches/hugetlb-check-for-anon_vma-prior-to-folio-allocation.patch new file mode 100644 index 000000000..8448eb11a --- /dev/null +++ b/patches/hugetlb-check-for-anon_vma-prior-to-folio-allocation.patch @@ -0,0 +1,58 @@ +From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com> +Subject: hugetlb: check for anon_vma prior to folio allocation +Date: Mon, 15 Apr 2024 14:17:47 -0700 + +Commit 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of +anon_vma_prepare()") may bail out after allocating a folio if we do not +hold the mmap lock. When this occurs, vmf_anon_prepare() will release the +vma lock. Hugetlb then attempts to call restore_reserve_on_error(), which +depends on the vma lock being held. + +We can move vmf_anon_prepare() prior to the folio allocation in order to +avoid calling restore_reserve_on_error() without the vma lock. 
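As a minimal sketch of the resulting ordering (condensed from the hugetlb_no_page() hunk below; the locals, labels and error handling are those of the surrounding function), the anon_vma preparation now runs before anything is allocated, so its failure path has nothing to restore:

	if (!(vma->vm_flags & VM_MAYSHARE)) {
		ret = vmf_anon_prepare(vmf);	/* may fail and drop the vma lock */
		if (unlikely(ret))
			goto out;		/* nothing allocated yet, nothing to restore */
	}
	folio = alloc_hugetlb_folio(vma, haddr, 0);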
+ +Link: https://lkml.kernel.org/r/ZiFqSrSRLhIV91og@fedora +Fixes: 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()") +Reported-by: syzbot+ad1b592fc4483655438b@syzkaller.appspotmail.com +Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com> +Cc: Muchun Song <muchun.song@linux.dev> +Cc: <stable@vger.kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + mm/hugetlb.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/mm/hugetlb.c~hugetlb-check-for-anon_vma-prior-to-folio-allocation ++++ a/mm/hugetlb.c +@@ -6261,6 +6261,12 @@ static vm_fault_t hugetlb_no_page(struct + VM_UFFD_MISSING); + } + ++ if (!(vma->vm_flags & VM_MAYSHARE)) { ++ ret = vmf_anon_prepare(vmf); ++ if (unlikely(ret)) ++ goto out; ++ } ++ + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { + /* +@@ -6297,15 +6303,12 @@ static vm_fault_t hugetlb_no_page(struct + */ + restore_reserve_on_error(h, vma, haddr, folio); + folio_put(folio); ++ ret = VM_FAULT_SIGBUS; + goto out; + } + new_pagecache_folio = true; + } else { + folio_lock(folio); +- +- ret = vmf_anon_prepare(vmf); +- if (unlikely(ret)) +- goto backout_unlocked; + anon_rmap = 1; + } + } else { +_ diff --git a/patches/hugetlb-convert-hugetlb_no_page-to-use-struct-vm_fault.patch b/patches/hugetlb-convert-hugetlb_no_page-to-use-struct-vm_fault.patch index 88611c178..8d3c318e0 100644 --- a/patches/hugetlb-convert-hugetlb_no_page-to-use-struct-vm_fault.patch +++ b/patches/hugetlb-convert-hugetlb_no_page-to-use-struct-vm_fault.patch @@ -64,8 +64,8 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> ret = 0; goto out; } -@@ -6256,7 +6252,7 @@ static vm_fault_t hugetlb_no_page(struct - VM_UFFD_MISSING); +@@ -6262,7 +6258,7 @@ static vm_fault_t hugetlb_no_page(struct + goto out; } - folio = alloc_hugetlb_folio(vma, haddr, 0); @@ -73,7 +73,7 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> if (IS_ERR(folio)) { /* * Returning error will result in faulting task being -@@ -6270,18 +6266,20 @@ static vm_fault_t hugetlb_no_page(struct +@@ -6276,18 +6272,20 @@ static vm_fault_t hugetlb_no_page(struct * here. Before returning error, get ptl and make * sure there really is no pte entry. */ @@ -97,7 +97,7 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> if (err) { /* * err can't be -EEXIST which implies someone -@@ -6290,7 +6288,8 @@ static vm_fault_t hugetlb_no_page(struct +@@ -6296,7 +6294,8 @@ static vm_fault_t hugetlb_no_page(struct * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ @@ -105,9 +105,9 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> + restore_reserve_on_error(h, vma, vmf->address, + folio); folio_put(folio); + ret = VM_FAULT_SIGBUS; goto out; - } -@@ -6320,7 +6319,7 @@ static vm_fault_t hugetlb_no_page(struct +@@ -6323,7 +6322,7 @@ static vm_fault_t hugetlb_no_page(struct folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ @@ -116,7 +116,7 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> ret = 0; goto out; } -@@ -6335,23 +6334,23 @@ static vm_fault_t hugetlb_no_page(struct +@@ -6338,23 +6337,23 @@ static vm_fault_t hugetlb_no_page(struct * any allocations necessary to record that reservation occur outside * the spinlock. 
*/ @@ -146,7 +146,7 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) -@@ -6360,17 +6359,18 @@ static vm_fault_t hugetlb_no_page(struct + * If this pte was previously wr-protected, keep it wr-protected even + * if populated. + */ @@ -170,7 +170,7 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> /* * Only set hugetlb_migratable in newly allocated pages. Existing pages -@@ -6387,10 +6387,10 @@ out: +@@ -6390,10 +6390,10 @@ out: return ret; backout: @@ -183,7 +183,7 @@ Signed-off-by: Andrew Morton <akpm@linux-foundation.org> folio_unlock(folio); folio_put(folio); -@@ -6486,8 +6486,7 @@ vm_fault_t hugetlb_fault(struct mm_struc +@@ -6489,8 +6489,7 @@ vm_fault_t hugetlb_fault(struct mm_struc * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ diff --git a/patches/init-fix-allocated-page-overlapping-with-ptr_err.patch b/patches/init-fix-allocated-page-overlapping-with-ptr_err.patch new file mode 100644 index 000000000..0e839c0ee --- /dev/null +++ b/patches/init-fix-allocated-page-overlapping-with-ptr_err.patch @@ -0,0 +1,66 @@ +From: Nam Cao <namcao@linutronix.de> +Subject: init: fix allocated page overlapping with PTR_ERR +Date: Thu, 18 Apr 2024 12:29:43 +0200 + +There is nothing preventing kernel memory allocators from allocating a +page that overlaps with PTR_ERR(), except for architecture-specific code +that sets up memblock. + +It was discovered that the RISCV architecture doesn't set up memblock +correctly, leading to a page overlapping with PTR_ERR() being allocated, +and subsequently crashing the kernel (link in the Closes: tag below). + +The reported crash has nothing to do with PTR_ERR(): the last page (at +address 0xfffff000) being allocated leads to an unexpected arithmetic +overflow in ext4; but still, this page shouldn't be allocated in the first +place. + +Because PTR_ERR() is an architecture-independent thing, we shouldn't ask +every single architecture to set this up. There may be other +architectures besides RISCV that have the same problem. + +Fix this once and for all by reserving the physical memory page that may +be mapped to the last virtual memory page as part of low memory. + +Unfortunately, this means if there is actual memory at this reserved +location, that memory will become inaccessible. However, if this page is +not reserved, it can only be accessed as high memory, so this doesn't +matter if high memory is not supported. Even if high memory is supported, +it is still only one page. 
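The reservation works because error pointers occupy only the top MAX_ERRNO (4095) values of the address space, i.e. exactly the last virtual page. A small sketch of the encoding being protected, mirroring the kernel's IS_ERR_VALUE() logic (simplified; compilable on its own):

	#define MAX_ERRNO	4095

	/* An address is mistakable for an encoded -errno iff it lies in the
	 * final page, e.g. 0xfffff000..0xffffffff with 32-bit pointers and
	 * 4 KiB pages; hence only that one page must never be handed out. */
	static inline int is_err_value(unsigned long x)
	{
		return x >= (unsigned long)-MAX_ERRNO;
	}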
+ +Closes: https://lore.kernel.org/linux-riscv/878r1ibpdn.fsf@all.your.base.are.belong.to.us +Link: https://lkml.kernel.org/r/20240418102943.180510-1-namcao@linutronix.de +Signed-off-by: Nam Cao <namcao@linutronix.de> +Reported-by: Björn Töpel <bjorn@kernel.org> +Tested-by: Björn Töpel <bjorn@kernel.org> +Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org> +Cc: Andreas Dilger <adilger@dilger.ca> +Cc: Arnd Bergmann <arnd@arndb.de> +Cc: Changbin Du <changbin.du@huawei.com> +Cc: Christophe Leroy <christophe.leroy@csgroup.eu> +Cc: Geert Uytterhoeven <geert+renesas@glider.be> +Cc: Ingo Molnar <mingo@kernel.org> +Cc: Krister Johansen <kjlx@templeofstupid.com> +Cc: Luis Chamberlain <mcgrof@kernel.org> +Cc: Nick Desaulniers <ndesaulniers@google.com> +Cc: Stephen Rothwell <sfr@canb.auug.org.au> +Cc: Tejun Heo <tj@kernel.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: <stable@vger.kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + init/main.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/init/main.c~init-fix-allocated-page-overlapping-with-ptr_err ++++ a/init/main.c +@@ -900,6 +900,7 @@ void start_kernel(void) + page_address_init(); + pr_notice("%s", linux_banner); + early_security_init(); ++ memblock_reserve(__pa(-PAGE_SIZE), PAGE_SIZE); /* reserve last page for ERR_PTR */ + setup_arch(&command_line); + setup_boot_config(); + setup_command_line(command_line); +_ diff --git a/patches/mm-arm64-override-clear_young_dirty_ptes-batch-helper.patch b/patches/mm-arm64-override-clear_young_dirty_ptes-batch-helper.patch new file mode 100644 index 000000000..6e777bb5a --- /dev/null +++ b/patches/mm-arm64-override-clear_young_dirty_ptes-batch-helper.patch @@ -0,0 +1,154 @@ +From: Lance Yang <ioworker0@gmail.com> +Subject: mm/arm64: override clear_young_dirty_ptes() batch helper +Date: Thu, 18 Apr 2024 21:44:33 +0800 + +The per-pte get_and_clear/modify/set approach would result in +unfolding/refolding for contpte mappings on arm64. So we need to override +clear_young_dirty_ptes() for arm64 to avoid it. 
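The override keeps the common case cheap. A condensed sketch of the dispatch added to pgtable.h below: a single, non-contpte PTE stays on the per-PTE fast path, and anything else takes the contpte-aware batch path:

	if (likely(nr == 1 && !pte_cont(__ptep_get(ptep))))
		__clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
	else
		contpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags);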
+ +Link: https://lkml.kernel.org/r/20240418134435.6092-3-ioworker0@gmail.com +Signed-off-by: Lance Yang <ioworker0@gmail.com> +Suggested-by: Barry Song <21cnbao@gmail.com> +Suggested-by: Ryan Roberts <ryan.roberts@arm.com> +Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> +Cc: David Hildenbrand <david@redhat.com> +Cc: Jeff Xie <xiehuan09@gmail.com> +Cc: Kefeng Wang <wangkefeng.wang@huawei.com> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Minchan Kim <minchan@kernel.org> +Cc: Muchun Song <songmuchun@bytedance.com> +Cc: Peter Xu <peterx@redhat.com> +Cc: Yang Shi <shy828301@gmail.com> +Cc: Yin Fengwei <fengwei.yin@intel.com> +Cc: Zach O'Keefe <zokeefe@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + arch/arm64/include/asm/pgtable.h | 55 +++++++++++++++++++++++++++++ + arch/arm64/mm/contpte.c | 29 +++++++++++++++ + 2 files changed, 84 insertions(+) + +--- a/arch/arm64/include/asm/pgtable.h~mm-arm64-override-clear_young_dirty_ptes-batch-helper ++++ a/arch/arm64/include/asm/pgtable.h +@@ -1223,6 +1223,46 @@ static inline void __wrprotect_ptes(stru + __ptep_set_wrprotect(mm, address, ptep); + } + ++static inline void __clear_young_dirty_pte(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, ++ pte_t pte, cydp_t flags) ++{ ++ pte_t old_pte; ++ ++ do { ++ old_pte = pte; ++ ++ if (flags & CYDP_CLEAR_YOUNG) ++ pte = pte_mkold(pte); ++ if (flags & CYDP_CLEAR_DIRTY) ++ pte = pte_mkclean(pte); ++ ++ pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), ++ pte_val(old_pte), pte_val(pte)); ++ } while (pte_val(pte) != pte_val(old_pte)); ++} ++ ++static inline void __clear_young_dirty_ptes(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, ++ unsigned int nr, cydp_t flags) ++{ ++ pte_t pte; ++ ++ for (;;) { ++ pte = __ptep_get(ptep); ++ ++ if (flags == (CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY)) ++ __set_pte(ptep, pte_mkclean(pte_mkold(pte))); ++ else ++ __clear_young_dirty_pte(vma, addr, ptep, pte, flags); ++ ++ if (--nr == 0) ++ break; ++ ptep++; ++ addr += PAGE_SIZE; ++ } ++} ++ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + #define __HAVE_ARCH_PMDP_SET_WRPROTECT + static inline void pmdp_set_wrprotect(struct mm_struct *mm, +@@ -1379,6 +1419,9 @@ extern void contpte_wrprotect_ptes(struc + extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); ++extern void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, ++ unsigned int nr, cydp_t flags); + + static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +@@ -1603,6 +1646,17 @@ static inline int ptep_set_access_flags( + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + ++#define clear_young_dirty_ptes clear_young_dirty_ptes ++static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, ++ unsigned int nr, cydp_t flags) ++{ ++ if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) ++ __clear_young_dirty_ptes(vma, addr, ptep, nr, flags); ++ else ++ contpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags); ++} ++ + #else /* CONFIG_ARM64_CONTPTE */ + + #define ptep_get __ptep_get +@@ -1622,6 +1676,7 @@ static inline int ptep_set_access_flags( + #define wrprotect_ptes __wrprotect_ptes + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + #define ptep_set_access_flags __ptep_set_access_flags ++#define clear_young_dirty_ptes __clear_young_dirty_ptes + + #endif /* CONFIG_ARM64_CONTPTE */ + +--- 
a/arch/arm64/mm/contpte.c~mm-arm64-override-clear_young_dirty_ptes-batch-helper ++++ a/arch/arm64/mm/contpte.c +@@ -361,6 +361,35 @@ void contpte_wrprotect_ptes(struct mm_st + } + EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); + ++void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, ++ unsigned int nr, cydp_t flags) ++{ ++ /* ++ * We can safely clear access/dirty without needing to unfold from ++ * the architectures perspective, even when contpte is set. If the ++ * range starts or ends midway through a contpte block, we can just ++ * expand to include the full contpte block. While this is not ++ * exactly what the core-mm asked for, it tracks access/dirty per ++ * folio, not per page. And since we only create a contpte block ++ * when it is covered by a single folio, we can get away with ++ * clearing access/dirty for the whole block. ++ */ ++ unsigned long start = addr; ++ unsigned long end = start + nr; ++ ++ if (pte_cont(__ptep_get(ptep + nr - 1))) ++ end = ALIGN(end, CONT_PTE_SIZE); ++ ++ if (pte_cont(__ptep_get(ptep))) { ++ start = ALIGN_DOWN(start, CONT_PTE_SIZE); ++ ptep = contpte_align_down(ptep); ++ } ++ ++ __clear_young_dirty_ptes(vma, start, ptep, end - start, flags); ++} ++EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes); ++ + int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +_ diff --git a/patches/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.patch b/patches/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.patch new file mode 100644 index 000000000..6158d9b29 --- /dev/null +++ b/patches/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.patch @@ -0,0 +1,51 @@ +From: David Hildenbrand <david@redhat.com> +Subject: mm/huge_memory: improve split_huge_page_to_list_to_order() return value documentation +Date: Thu, 18 Apr 2024 17:18:34 +0200 + +The documentation is wrong and relying on it almost resulted in BUGs in +new callers: we return -EAGAIN on unexpected folio references, not -EBUSY. + +Let's fix that and also document which other return values we can +currently see and why they could happen. + +Link: https://lkml.kernel.org/r/20240418151834.216557-1-david@redhat.com +Signed-off-by: David Hildenbrand <david@redhat.com> +Cc: John Hubbard <jhubbard@nvidia.com> +Cc: Zi Yan <ziy@nvidia.com> +Cc: Matthew Wilcox <willy@infradead.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + mm/huge_memory.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/mm/huge_memory.c~mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation ++++ a/mm/huge_memory.c +@@ -2956,7 +2956,7 @@ bool can_split_folio(struct folio *folio + * + * 3) The folio must not be pinned. Any unexpected folio references, including + * GUP pins, will result in the folio not getting split; instead, the caller +- * will receive an -EBUSY. ++ * will receive an -EAGAIN. + * + * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not + * supported for non-file-backed folios, because folio->_deferred_list, which +@@ -2975,8 +2975,15 @@ bool can_split_folio(struct folio *folio + * + * Returns 0 if the huge page was split successfully. + * +- * Returns -EBUSY if @page's folio is pinned, or if the anon_vma disappeared +- * from under us. ++ * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP). 
++ * ++ * Returns -EBUSY when trying to split the huge zeropage, if the folio is ++ * under writeback, if fs-specific folio metadata cannot currently be ++ * released, or if some unexpected race happened (e.g., anon VMA disappeared, ++ * truncation). ++ * ++ * Returns -EINVAL when trying to split to an order that is incompatible ++ * with the folio. Splitting to order 0 is compatible with all folios. + */ + int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +_ diff --git a/patches/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.patch b/patches/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.patch new file mode 100644 index 000000000..38a1b498d --- /dev/null +++ b/patches/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.patch @@ -0,0 +1,190 @@ +From: Lance Yang <ioworker0@gmail.com> +Subject: mm/madvise: introduce clear_young_dirty_ptes() batch helper +Date: Thu, 18 Apr 2024 21:44:32 +0800 + +Patch series "mm/madvise: enhance lazyfreeing with mTHP in madvise_free", +v10. + +This patchset adds support for lazyfreeing multi-size THP (mTHP) without +needing to first split the large folio via split_folio(). However, we +still need to split a large folio that is not fully mapped within the +target range. + +If a large folio is locked or shared, or if we fail to split it, we just +leave it in place and advance to the next PTE in the range. But note that +the behavior is changed; previously, any failure of this sort would cause +the entire operation to give up. As large folios become more common, +sticking to the old way could result in wasted opportunities. + +Performance Testing +=================== + +On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of +the same size results in the following runtimes for madvise(MADV_FREE) in +seconds (shorter is better): + +Folio Size | Old | New | Change +------------------------------------------ + 4KiB | 0.590251 | 0.590259 | 0% + 16KiB | 2.990447 | 0.185655 | -94% + 32KiB | 2.547831 | 0.104870 | -95% + 64KiB | 2.457796 | 0.052812 | -97% + 128KiB | 2.281034 | 0.032777 | -99% + 256KiB | 2.230387 | 0.017496 | -99% + 512KiB | 2.189106 | 0.010781 | -99% + 1024KiB | 2.183949 | 0.007753 | -99% + 2048KiB | 0.002799 | 0.002804 | 0% + + +This patch (of 4): + +This commit introduces clear_young_dirty_ptes() to replace mkold_ptes(). +By doing so, we can use the same function for both use cases +(madvise_pageout and madvise_free), and it also provides the flexibility +to only clear the dirty flag in the future if needed. 
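As a usage sketch, condensed from the call sites in this series (madvise_pageout keeps the old mkold_ptes() semantics, while madvise_free clears both bits in one pass):

	/* age nr consecutive PTEs of one folio, as mkold_ptes() used to */
	clear_young_dirty_ptes(vma, addr, ptep, nr, CYDP_CLEAR_YOUNG);

	/* lazyfree path: mark the whole batch old and clean in one call */
	clear_young_dirty_ptes(vma, addr, ptep, nr,
			       CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY);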
+ +Link: https://lkml.kernel.org/r/20240418134435.6092-1-ioworker0@gmail.com +Link: https://lkml.kernel.org/r/20240418134435.6092-2-ioworker0@gmail.com +Signed-off-by: Lance Yang <ioworker0@gmail.com> +Suggested-by: Ryan Roberts <ryan.roberts@arm.com> +Acked-by: David Hildenbrand <david@redhat.com> +Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> +Cc: Barry Song <21cnbao@gmail.com> +Cc: Jeff Xie <xiehuan09@gmail.com> +Cc: Kefeng Wang <wangkefeng.wang@huawei.com> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Minchan Kim <minchan@kernel.org> +Cc: Muchun Song <songmuchun@bytedance.com> +Cc: Peter Xu <peterx@redhat.com> +Cc: Yang Shi <shy828301@gmail.com> +Cc: Yin Fengwei <fengwei.yin@intel.com> +Cc: Zach O'Keefe <zokeefe@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + include/linux/mm_types.h | 9 ++++ + include/linux/pgtable.h | 74 ++++++++++++++++++++++--------------- + mm/madvise.c | 3 + + 3 files changed, 55 insertions(+), 31 deletions(-) + +--- a/include/linux/mm_types.h~mm-madvise-introduce-clear_young_dirty_ptes-batch-helper ++++ a/include/linux/mm_types.h +@@ -1368,6 +1368,15 @@ enum fault_flag { + + typedef unsigned int __bitwise zap_flags_t; + ++/* Flags for clear_young_dirty_ptes(). */ ++typedef int __bitwise cydp_t; ++ ++/* Clear the access bit */ ++#define CYDP_CLEAR_YOUNG ((__force cydp_t)BIT(0)) ++ ++/* Clear the dirty bit */ ++#define CYDP_CLEAR_DIRTY ((__force cydp_t)BIT(1)) ++ + /* + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: +--- a/include/linux/pgtable.h~mm-madvise-introduce-clear_young_dirty_ptes-batch-helper ++++ a/include/linux/pgtable.h +@@ -361,36 +361,6 @@ static inline int ptep_test_and_clear_yo + } + #endif + +-#ifndef mkold_ptes +-/** +- * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old. +- * @vma: VMA the pages are mapped into. +- * @addr: Address the first page is mapped at. +- * @ptep: Page table pointer for the first entry. +- * @nr: Number of entries to mark old. +- * +- * May be overridden by the architecture; otherwise, implemented as a simple +- * loop over ptep_test_and_clear_young(). +- * +- * Note that PTE bits in the PTE range besides the PFN can differ. For example, +- * some PTEs might be write-protected. +- * +- * Context: The caller holds the page table lock. The PTEs map consecutive +- * pages that belong to the same folio. The PTEs are all in the same PMD. +- */ +-static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr, +- pte_t *ptep, unsigned int nr) +-{ +- for (;;) { +- ptep_test_and_clear_young(vma, addr, ptep); +- if (--nr == 0) +- break; +- ptep++; +- addr += PAGE_SIZE; +- } +-} +-#endif +- + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, +@@ -489,6 +459,50 @@ static inline pte_t ptep_get_and_clear(s + } + #endif + ++#ifndef clear_young_dirty_ptes ++/** ++ * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the ++ * same folio as old/clean. ++ * @mm: Address space the pages are mapped into. ++ * @addr: Address the first page is mapped at. ++ * @ptep: Page table pointer for the first entry. ++ * @nr: Number of entries to mark old/clean. ++ * @flags: Flags to modify the PTE batch semantics. 
++ * ++ * May be overridden by the architecture; otherwise, implemented by ++ * get_and_clear/modify/set for each pte in the range. ++ * ++ * Note that PTE bits in the PTE range besides the PFN can differ. For example, ++ * some PTEs might be write-protected. ++ * ++ * Context: The caller holds the page table lock. The PTEs map consecutive ++ * pages that belong to the same folio. The PTEs are all in the same PMD. ++ */ ++static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, ++ unsigned int nr, cydp_t flags) ++{ ++ pte_t pte; ++ ++ for (;;) { ++ if (flags == CYDP_CLEAR_YOUNG) ++ ptep_test_and_clear_young(vma, addr, ptep); ++ else { ++ pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); ++ if (flags & CYDP_CLEAR_YOUNG) ++ pte = pte_mkold(pte); ++ if (flags & CYDP_CLEAR_DIRTY) ++ pte = pte_mkclean(pte); ++ set_pte_at(vma->vm_mm, addr, ptep, pte); ++ } ++ if (--nr == 0) ++ break; ++ ptep++; ++ addr += PAGE_SIZE; ++ } ++} ++#endif ++ + static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) + { +--- a/mm/madvise.c~mm-madvise-introduce-clear_young_dirty_ptes-batch-helper ++++ a/mm/madvise.c +@@ -507,7 +507,8 @@ restart: + continue; + + if (!pageout && pte_young(ptent)) { +- mkold_ptes(vma, addr, pte, nr); ++ clear_young_dirty_ptes(vma, addr, pte, nr, ++ CYDP_CLEAR_YOUNG); + tlb_remove_tlb_entries(tlb, pte, nr, addr); + } + +_ diff --git a/patches/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.patch b/patches/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.patch new file mode 100644 index 000000000..ce8bf7ebe --- /dev/null +++ b/patches/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.patch @@ -0,0 +1,172 @@ +From: Lance Yang <ioworker0@gmail.com> +Subject: mm/madvise: optimize lazyfreeing with mTHP in madvise_free +Date: Thu, 18 Apr 2024 21:44:35 +0800 + +This patch optimizes lazyfreeing with PTE-mapped mTHP[1] (Inspired by +David Hildenbrand[2]). We aim to avoid unnecessary folio splitting if the +large folio is fully mapped within the target range. + +If a large folio is locked or shared, or if we fail to split it, we just +leave it in place and advance to the next PTE in the range. But note that +the behavior is changed; previously, any failure of this sort would cause +the entire operation to give up. As large folios become more common, +sticking to the old way could result in wasted opportunities. 
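The skip-instead-of-abort policy amounts to the following loop shape (condensed from the madvise_free_pte_range() hunk below; split and unlock details elided):

	if (folio_test_large(folio)) {
		nr = madvise_folio_pte_batch(addr, end, folio, pte, ptent,
					     &any_young, &any_dirty);
		if (nr < folio_nr_pages(folio)) {	/* not fully mapped */
			if (folio_likely_mapped_shared(folio))
				continue;		/* shared: leave in place */
			if (!folio_trylock(folio))
				continue;		/* locked: leave in place */
			/* otherwise try split_folio(); on failure, also just
			 * advance to the next pte instead of giving up */
		}
	}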
+ +On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of +the same size results in the following runtimes for madvise(MADV_FREE) in +seconds (shorter is better): + +Folio Size | Old | New | Change +------------------------------------------ + 4KiB | 0.590251 | 0.590259 | 0% + 16KiB | 2.990447 | 0.185655 | -94% + 32KiB | 2.547831 | 0.104870 | -95% + 64KiB | 2.457796 | 0.052812 | -97% + 128KiB | 2.281034 | 0.032777 | -99% + 256KiB | 2.230387 | 0.017496 | -99% + 512KiB | 2.189106 | 0.010781 | -99% + 1024KiB | 2.183949 | 0.007753 | -99% + 2048KiB | 0.002799 | 0.002804 | 0% + +[1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com +[2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com + +Link: https://lkml.kernel.org/r/20240418134435.6092-5-ioworker0@gmail.com +Signed-off-by: Lance Yang <ioworker0@gmail.com> +Reviewed-by: Ryan Roberts <ryan.roberts@arm.com> +Acked-by: David Hildenbrand <david@redhat.com> +Cc: Barry Song <21cnbao@gmail.com> +Cc: Jeff Xie <xiehuan09@gmail.com> +Cc: Kefeng Wang <wangkefeng.wang@huawei.com> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Minchan Kim <minchan@kernel.org> +Cc: Muchun Song <songmuchun@bytedance.com> +Cc: Peter Xu <peterx@redhat.com> +Cc: Yang Shi <shy828301@gmail.com> +Cc: Yin Fengwei <fengwei.yin@intel.com> +Cc: Zach O'Keefe <zokeefe@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + mm/madvise.c | 85 +++++++++++++++++++++++++------------------------ + 1 file changed, 44 insertions(+), 41 deletions(-) + +--- a/mm/madvise.c~mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free ++++ a/mm/madvise.c +@@ -643,6 +643,7 @@ static int madvise_free_pte_range(pmd_t + unsigned long end, struct mm_walk *walk) + + { ++ const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; +@@ -697,44 +698,57 @@ static int madvise_free_pte_range(pmd_t + continue; + + /* +- * If pmd isn't transhuge but the folio is large and +- * is owned by only this process, split it and +- * deactivate all pages. ++ * If we encounter a large folio, only split it if it is not ++ * fully mapped within the range we are operating on. Otherwise ++ * leave it as is so that it can be marked as lazyfree. If we ++ * fail to split a folio, leave it in place and advance to the ++ * next pte in the range. 
+ */ + if (folio_test_large(folio)) { +- int err; ++ bool any_young, any_dirty; + +- if (folio_likely_mapped_shared(folio)) +- break; +- if (!folio_trylock(folio)) +- break; +- folio_get(folio); +- arch_leave_lazy_mmu_mode(); +- pte_unmap_unlock(start_pte, ptl); +- start_pte = NULL; +- err = split_folio(folio); +- folio_unlock(folio); +- folio_put(folio); +- if (err) +- break; +- start_pte = pte = +- pte_offset_map_lock(mm, pmd, addr, &ptl); +- if (!start_pte) +- break; +- arch_enter_lazy_mmu_mode(); +- pte--; +- addr -= PAGE_SIZE; +- continue; ++ nr = madvise_folio_pte_batch(addr, end, folio, pte, ++ ptent, &any_young, &any_dirty); ++ ++ if (nr < folio_nr_pages(folio)) { ++ int err; ++ ++ if (folio_likely_mapped_shared(folio)) ++ continue; ++ if (!folio_trylock(folio)) ++ continue; ++ folio_get(folio); ++ arch_leave_lazy_mmu_mode(); ++ pte_unmap_unlock(start_pte, ptl); ++ start_pte = NULL; ++ err = split_folio(folio); ++ folio_unlock(folio); ++ folio_put(folio); ++ pte = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ start_pte = pte; ++ if (!start_pte) ++ break; ++ arch_enter_lazy_mmu_mode(); ++ if (!err) ++ nr = 0; ++ continue; ++ } ++ ++ if (any_young) ++ ptent = pte_mkyoung(ptent); ++ if (any_dirty) ++ ptent = pte_mkdirty(ptent); + } + + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { + if (!folio_trylock(folio)) + continue; + /* +- * If folio is shared with others, we mustn't clear +- * the folio's dirty flag. ++ * If we have a large folio at this point, we know it is ++ * fully mapped so if its mapcount is the same as its ++ * number of pages, it must be exclusive. + */ +- if (folio_mapcount(folio) != 1) { ++ if (folio_mapcount(folio) != folio_nr_pages(folio)) { + folio_unlock(folio); + continue; + } +@@ -750,19 +764,8 @@ static int madvise_free_pte_range(pmd_t + } + + if (pte_young(ptent) || pte_dirty(ptent)) { +- /* +- * Some of architecture(ex, PPC) don't update TLB +- * with set_pte_at and tlb_remove_tlb_entry so for +- * the portability, remap the pte with old|clean +- * after pte clearing. +- */ +- ptent = ptep_get_and_clear_full(mm, addr, pte, +- tlb->fullmm); +- +- ptent = pte_mkold(ptent); +- ptent = pte_mkclean(ptent); +- set_pte_at(mm, addr, pte, ptent); +- tlb_remove_tlb_entry(tlb, pte, addr); ++ clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); ++ tlb_remove_tlb_entries(tlb, pte, nr, addr); + } + folio_mark_lazyfree(folio); + } +_ diff --git a/patches/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.patch b/patches/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.patch new file mode 100644 index 000000000..64221bb7d --- /dev/null +++ b/patches/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.patch @@ -0,0 +1,142 @@ +From: Lance Yang <ioworker0@gmail.com> +Subject: mm/memory: add any_dirty optional pointer to folio_pte_batch() +Date: Thu, 18 Apr 2024 21:44:34 +0800 + +This commit adds the any_dirty pointer as an optional parameter to +folio_pte_batch() function. By using both the any_young and any_dirty +pointers, madvise_free can make smarter decisions about whether to clear +the PTEs when marking large folios as lazyfree. 
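A condensed caller's view of the extended interface (the madvise_folio_pte_batch() wrapper added below packages exactly this up; ptent and the other locals are those of the surrounding page-table walk):

	bool any_young, any_dirty;
	int nr;

	/* batch across one large folio's PTEs, learning whether any entry
	 * in the batch was young or dirty, so the whole batch can then be
	 * treated uniformly */
	nr = folio_pte_batch(folio, addr, ptep, ptent, max_nr, fpb_flags,
			     NULL, &any_young, &any_dirty);
	if (any_young)
		ptent = pte_mkyoung(ptent);
	if (any_dirty)
		ptent = pte_mkdirty(ptent);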
+ +Link: https://lkml.kernel.org/r/20240418134435.6092-4-ioworker0@gmail.com +Signed-off-by: Lance Yang <ioworker0@gmail.com> +Suggested-by: David Hildenbrand <david@redhat.com> +Acked-by: David Hildenbrand <david@redhat.com> +Cc: Barry Song <21cnbao@gmail.com> +Cc: Jeff Xie <xiehuan09@gmail.com> +Cc: Kefeng Wang <wangkefeng.wang@huawei.com> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Minchan Kim <minchan@kernel.org> +Cc: Muchun Song <songmuchun@bytedance.com> +Cc: Peter Xu <peterx@redhat.com> +Cc: Ryan Roberts <ryan.roberts@arm.com> +Cc: Yang Shi <shy828301@gmail.com> +Cc: Yin Fengwei <fengwei.yin@intel.com> +Cc: Zach O'Keefe <zokeefe@google.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + mm/internal.h | 12 ++++++++++-- + mm/madvise.c | 19 ++++++++++++++----- + mm/memory.c | 4 ++-- + 3 files changed, 26 insertions(+), 9 deletions(-) + +--- a/mm/internal.h~mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch ++++ a/mm/internal.h +@@ -134,6 +134,8 @@ static inline pte_t __pte_batch_clear_ig + * first one is writable. + * @any_young: Optional pointer to indicate whether any entry except the + * first one is young. ++ * @any_dirty: Optional pointer to indicate whether any entry except the ++ * first one is dirty. + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio. +@@ -149,18 +151,20 @@ static inline pte_t __pte_batch_clear_ig + */ + static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, +- bool *any_writable, bool *any_young) ++ bool *any_writable, bool *any_young, bool *any_dirty) + { + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte, *ptep; +- bool writable, young; ++ bool writable, young, dirty; + int nr; + + if (any_writable) + *any_writable = false; + if (any_young) + *any_young = false; ++ if (any_dirty) ++ *any_dirty = false; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); +@@ -176,6 +180,8 @@ static inline int folio_pte_batch(struct + writable = !!pte_write(pte); + if (any_young) + young = !!pte_young(pte); ++ if (any_dirty) ++ dirty = !!pte_dirty(pte); + pte = __pte_batch_clear_ignored(pte, flags); + + if (!pte_same(pte, expected_pte)) +@@ -193,6 +199,8 @@ static inline int folio_pte_batch(struct + *any_writable |= writable; + if (any_young) + *any_young |= young; ++ if (any_dirty) ++ *any_dirty |= dirty; + + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); +--- a/mm/madvise.c~mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch ++++ a/mm/madvise.c +@@ -321,6 +321,18 @@ static inline bool can_do_file_pageout(s + file_permission(vma->vm_file, MAY_WRITE) == 0; + } + ++static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, ++ struct folio *folio, pte_t *ptep, ++ pte_t pte, bool *any_young, ++ bool *any_dirty) ++{ ++ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; ++ int max_nr = (end - addr) / PAGE_SIZE; ++ ++ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, ++ any_young, any_dirty); ++} ++ + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +@@ -456,13 +468,10 @@ restart: + * next pte in the range. 
+ */ + if (folio_test_large(folio)) { +- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | +- FPB_IGNORE_SOFT_DIRTY; +- int max_nr = (end - addr) / PAGE_SIZE; + bool any_young; + +- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, +- fpb_flags, NULL, &any_young); ++ nr = madvise_folio_pte_batch(addr, end, folio, pte, ++ ptent, &any_young, NULL); + if (any_young) + ptent = pte_mkyoung(ptent); + +--- a/mm/memory.c~mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch ++++ a/mm/memory.c +@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct + flags |= FPB_IGNORE_SOFT_DIRTY; + + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, +- &any_writable, NULL); ++ &any_writable, NULL, NULL); + folio_ref_add(folio, nr); + if (folio_test_anon(folio)) { + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, +@@ -1558,7 +1558,7 @@ static inline int zap_present_ptes(struc + */ + if (unlikely(folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, +- NULL, NULL); ++ NULL, NULL, NULL); + + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + addr, details, rss, force_flush, +_ diff --git a/patches/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.patch b/patches/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.patch new file mode 100644 index 000000000..e1aeb16b7 --- /dev/null +++ b/patches/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.patch @@ -0,0 +1,58 @@ +From: Kefeng Wang <wangkefeng.wang@huawei.com> +Subject: mm: swapfile: check usable swap device in __folio_throttle_swaprate() +Date: Thu, 18 Apr 2024 21:56:44 +0800 + +Skip blk_cgroup_congested() if there is no usable swap device since no +swapin/out will occur, Thereby avoid taking swap_lock. 
The difference is +shown below in the perf data of CoW pagefault: + + perf report -g -i perf.data.swapoff | egrep "blk_cgroup_congested|__folio_throttle_swaprate" + 1.01% 0.16% page_fault2_pro [kernel.kallsyms] [k] __folio_throttle_swaprate + 0.83% 0.80% page_fault2_pro [kernel.kallsyms] [k] blk_cgroup_congested + + perf report -g -i perf.data.swapon | egrep "blk_cgroup_congested|__folio_throttle_swaprate" + 0.15% 0.15% page_fault2_pro [kernel.kallsyms] [k] __folio_throttle_swaprate + +Link: https://lkml.kernel.org/r/20240418135644.2736748-1-wangkefeng.wang@huawei.com +Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> +Cc: Tejun Heo <tj@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + mm/swapfile.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/mm/swapfile.c~mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate ++++ a/mm/swapfile.c +@@ -2444,13 +2444,17 @@ static void reinsert_swap_info(struct sw + spin_unlock(&swap_lock); + } + ++static bool __has_usable_swap(void) ++{ ++ return !plist_head_empty(&swap_active_head); ++} ++ + bool has_usable_swap(void) + { +- bool ret = true; ++ bool ret; + + spin_lock(&swap_lock); +- if (plist_head_empty(&swap_active_head)) +- ret = false; ++ ret = __has_usable_swap(); + spin_unlock(&swap_lock); + return ret; + } +@@ -3710,6 +3714,9 @@ void __folio_throttle_swaprate(struct fo + if (!(gfp & __GFP_IO)) + return; + ++ if (!__has_usable_swap()) ++ return; ++ + if (!blk_cgroup_congested()) + return; + +_ diff --git a/patches/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.patch b/patches/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.patch new file mode 100644 index 000000000..21eaef699 --- /dev/null +++ b/patches/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.patch @@ -0,0 +1,76 @@ +From: Johannes Weiner <hannes@cmpxchg.org> +Subject: mm: zswap: fix shrinker NULL crash with cgroup_disable=memory +Date: Thu, 18 Apr 2024 08:26:28 -0400 + +Christian reports a NULL deref in zswap that he bisected down to the zswap +shrinker. The issue also cropped up in the bug trackers of libguestfs [1] +and the Red Hat bugzilla [2]. + +The problem is that when memcg is disabled with the boot time flag, the +zswap shrinker might get called with sc->memcg == NULL. This is okay in +many places, like the lruvec operations. But it crashes in +memcg_page_state() - which is only used due to the non-node accounting of +the cgroup's zswap memory to begin with. + +Nhat spotted that the memcg can be NULL in the memcg-disabled case, and I +was then able to reproduce the crash locally as well. + +[1] https://github.com/libguestfs/libguestfs/issues/139 +[2] https://bugzilla.redhat.com/show_bug.cgi?id=2275252 + +Link: https://lkml.kernel.org/r/20240418124043.GC1055428@cmpxchg.org +Link: https://lkml.kernel.org/r/20240417143324.GA1055428@cmpxchg.org +Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") +Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> +Reported-by: Christian Heusel <christian@heusel.eu> +Debugged-by: Nhat Pham <nphamcs@gmail.com> +Suggested-by: Nhat Pham <nphamcs@gmail.com> +Tested-by: Christian Heusel <christian@heusel.eu> +Cc: Chengming Zhou <chengming.zhou@linux.dev> +Cc: Dan Streetman <ddstreet@ieee.org> +Cc: Richard W.M. 
Jones <rjones@redhat.com> +Cc: Seth Jennings <sjenning@redhat.com> +Cc: Vitaly Wool <vitaly.wool@konsulko.com> +Cc: Yosry Ahmed <yosryahmed@google.com> +Cc: <stable@vger.kernel.org> [v6.8] +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + mm/zswap.c | 25 ++++++++++++++++--------- + 1 file changed, 16 insertions(+), 9 deletions(-) + +--- a/mm/zswap.c~mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory ++++ a/mm/zswap.c +@@ -1331,15 +1331,22 @@ static unsigned long zswap_shrinker_coun + if (!gfp_has_io_fs(sc->gfp_mask)) + return 0; + +-#ifdef CONFIG_MEMCG_KMEM +- mem_cgroup_flush_stats(memcg); +- nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; +- nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); +-#else +- /* use pool stats instead of memcg stats */ +- nr_backing = zswap_pool_total_size >> PAGE_SHIFT; +- nr_stored = atomic_read(&zswap_nr_stored); +-#endif ++ /* ++ * For memcg, use the cgroup-wide ZSWAP stats since we don't ++ * have them per-node and thus per-lruvec. Careful if memcg is ++ * runtime-disabled: we can get sc->memcg == NULL, which is ok ++ * for the lruvec, but not for memcg_page_state(). ++ * ++ * Without memcg, use the zswap pool-wide metrics. ++ */ ++ if (!mem_cgroup_disabled()) { ++ mem_cgroup_flush_stats(memcg); ++ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; ++ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); ++ } else { ++ nr_backing = zswap_pool_total_size >> PAGE_SHIFT; ++ nr_stored = atomic_read(&zswap_nr_stored); ++ } + + if (!nr_stored) + return 0; +_ diff --git a/patches/null-pointer-dereference-while-shrinking-zswap.patch b/patches/old/null-pointer-dereference-while-shrinking-zswap.patch index a94d69056..a94d69056 100644 --- a/patches/null-pointer-dereference-while-shrinking-zswap.patch +++ b/patches/old/null-pointer-dereference-while-shrinking-zswap.patch diff --git a/patches/stackdepot-respect-__gfp_nolockdep-allocation-flag.patch b/patches/stackdepot-respect-__gfp_nolockdep-allocation-flag.patch new file mode 100644 index 000000000..09dab83ca --- /dev/null +++ b/patches/stackdepot-respect-__gfp_nolockdep-allocation-flag.patch @@ -0,0 +1,90 @@ +From: Andrey Ryabinin <ryabinin.a.a@gmail.com> +Subject: stackdepot: respect __GFP_NOLOCKDEP allocation flag +Date: Thu, 18 Apr 2024 16:11:33 +0200 + +If stack_depot_save_flags() allocates memory, it always drops the +__GFP_NOLOCKDEP flag. So when KASAN tries to track a __GFP_NOLOCKDEP +allocation, we may end up with a lockdep splat like the one below: + +====================================================== + WARNING: possible circular locking dependency detected + 6.9.0-rc3+ #49 Not tainted + ------------------------------------------------------ + kswapd0/149 is trying to acquire lock: + ffff88811346a920 +(&xfs_nondir_ilock_class){++++}-{4:4}, at: xfs_reclaim_inode+0x3ac/0x590 +[xfs] + + but task is already holding lock: + ffffffff8bb33100 (fs_reclaim){+.+.}-{0:0}, at: +balance_pgdat+0x5d9/0xad0 + + which lock already depends on the new lock. 
+ + the existing dependency chain (in reverse order) is: + -> #1 (fs_reclaim){+.+.}-{0:0}: + __lock_acquire+0x7da/0x1030 + lock_acquire+0x15d/0x400 + fs_reclaim_acquire+0xb5/0x100 + prepare_alloc_pages.constprop.0+0xc5/0x230 + __alloc_pages+0x12a/0x3f0 + alloc_pages_mpol+0x175/0x340 + stack_depot_save_flags+0x4c5/0x510 + kasan_save_stack+0x30/0x40 + kasan_save_track+0x10/0x30 + __kasan_slab_alloc+0x83/0x90 + kmem_cache_alloc+0x15e/0x4a0 + __alloc_object+0x35/0x370 + __create_object+0x22/0x90 + __kmalloc_node_track_caller+0x477/0x5b0 + krealloc+0x5f/0x110 + xfs_iext_insert_raw+0x4b2/0x6e0 [xfs] + xfs_iext_insert+0x2e/0x130 [xfs] + xfs_iread_bmbt_block+0x1a9/0x4d0 [xfs] + xfs_btree_visit_block+0xfb/0x290 [xfs] + xfs_btree_visit_blocks+0x215/0x2c0 [xfs] + xfs_iread_extents+0x1a2/0x2e0 [xfs] + xfs_buffered_write_iomap_begin+0x376/0x10a0 [xfs] + iomap_iter+0x1d1/0x2d0 + iomap_file_buffered_write+0x120/0x1a0 + xfs_file_buffered_write+0x128/0x4b0 [xfs] + vfs_write+0x675/0x890 + ksys_write+0xc3/0x160 + do_syscall_64+0x94/0x170 + entry_SYSCALL_64_after_hwframe+0x71/0x79 + +Always preserve __GFP_NOLOCKDEP to fix this. + +Link: https://lkml.kernel.org/r/20240418141133.22950-1-ryabinin.a.a@gmail.com +Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB") +Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com> +Reported-by: Xiubo Li <xiubli@redhat.com> +Closes: https://lore.kernel.org/all/a0caa289-ca02-48eb-9bf2-d86fd47b71f4@redhat.com/ +Reported-by: Damien Le Moal <damien.lemoal@opensource.wdc.com> +Closes: https://lore.kernel.org/all/f9ff999a-e170-b66b-7caf-293f2b147ac2@opensource.wdc.com/ +Suggested-by: Dave Chinner <david@fromorbit.com> +Cc: Christoph Hellwig <hch@infradead.org> +Cc: Alexander Potapenko <glider@google.com> +Cc: <stable@vger.kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + lib/stackdepot.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/lib/stackdepot.c~stackdepot-respect-__gfp_nolockdep-allocation-flag ++++ a/lib/stackdepot.c +@@ -627,10 +627,10 @@ depot_stack_handle_t stack_depot_save_fl + /* + * Zero out zone modifiers, as we don't have specific zone + * requirements. Keep the flags related to allocation in atomic +- * contexts and I/O. ++ * contexts, I/O, nolockdep. 
+ */ + alloc_flags &= ~GFP_ZONEMASK; +- alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); ++ alloc_flags &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP); + alloc_flags |= __GFP_NOWARN; + page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER); + if (page) +_ diff --git a/pc/crash-add-prefix-for-crash-dumping-messages.pc b/pc/crash-add-prefix-for-crash-dumping-messages.pc new file mode 100644 index 000000000..76b71fde8 --- /dev/null +++ b/pc/crash-add-prefix-for-crash-dumping-messages.pc @@ -0,0 +1,2 @@ +kernel/crash_core.c +kernel/crash_reserve.c diff --git a/pc/devel-series b/pc/devel-series index 96756605d..c7370b154 100644 --- a/pc/devel-series +++ b/pc/devel-series @@ -109,10 +109,24 @@ selftests-harness-remove-use-of-line_max-fix-fix-fix.patch # selftests-mm-fix-unused-and-uninitialized-variable-warning.patch # -null-pointer-dereference-while-shrinking-zswap.patch # mm-hugetlb-fix-missing-hugetlb_lock-for-resv-uncharge.patch # +mm-create-folio_flag_false-and-folio_type_ops-macros.patch +mm-support-page_mapcount-on-page_has_type-pages.patch +mm-turn-folio_test_hugetlb-into-a-pagetype.patch +mm-turn-folio_test_hugetlb-into-a-pagetype-fix.patch +# +#mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.patch: https://lkml.kernel.org/r/CAJD7tkaPMQqQtfxcLWraz-vnbAxZKxuJRJ7vKuDOCCXtpBSF1A@mail.gmail.com +mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.patch +# +#hugetlb-check-for-anon_vma-prior-to-folio-allocation.patch: syzbot testing +hugetlb-check-for-anon_vma-prior-to-folio-allocation.patch +# +stackdepot-respect-__gfp_nolockdep-allocation-flag.patch +# +init-fix-allocated-page-overlapping-with-ptr_err.patch +# ### hfe # #ENDBRANCH mm-hotfixes-unstable @@ -234,11 +248,7 @@ mm-change-inlined-allocation-helpers-to-account-at-the-call-site.patch # mm-always-initialise-folio-_deferred_list.patch mm-always-initialise-folio-_deferred_list-fix.patch -mm-create-folio_flag_false-and-folio_type_ops-macros.patch mm-remove-folio_prep_large_rmappable.patch -mm-support-page_mapcount-on-page_has_type-pages.patch -mm-turn-folio_test_hugetlb-into-a-pagetype.patch -mm-turn-folio_test_hugetlb-into-a-pagetype-fix.patch mm-remove-a-call-to-compound_head-from-is_page_hwpoison.patch #mm-free-up-pg_slab.patch: check review https://lkml.kernel.org/r/202403312344.c0d273ab-oliver.sang@intel.com mm-free-up-pg_slab.patch @@ -606,10 +616,12 @@ mm-filemap-batch-mm-counter-updating-in-filemap_map_pages.patch # mm-page_alloc-allowing-mthp-compaction-to-capture-the-freed-page-directly.patch # +#mseal-wire-up-mseal-syscall.patch: https://lkml.kernel.org/r/CAJuCfpFLwJg4n7wPpT+u9vC4XHoLE_BPPZ0tDKf7W45hGky4_Q@mail.gmail.com mseal-wire-up-mseal-syscall.patch mseal-add-mseal-syscall.patch selftest-mm-mseal-memory-sealing.patch mseal-add-documentation.patch +#selftest-mm-mseal-read-only-elf-memory-segment.patch: https://lkml.kernel.org/r/CA+G9fYvacWNZsmizotfcwD35xBq0999_EAV0wZgwjdi46yivgg@mail.gmail.com selftest-mm-mseal-read-only-elf-memory-segment.patch selftest-mm-mseal-read-only-elf-memory-segment-fix.patch # @@ -645,6 +657,15 @@ mm-hugetlb-assert-hugetlb_lock-in-__hugetlb_cgroup_commit_charge.patch # mm-page_table_check-support-userfault-wr-protect-entries.patch # +mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.patch +# +mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.patch +# +mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.patch +mm-arm64-override-clear_young_dirty_ptes-batch-helper.patch +mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.patch 
+mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.patch +# # # # @@ -797,4 +818,6 @@ selftests-exec-make-binaries-position-independent.patch # cpumask-delete-unused-reset_cpu_possible_mask.patch # +crash-add-prefix-for-crash-dumping-messages.patch +# #ENDBRANCH mm-nonmm-unstable diff --git a/pc/hugetlb-check-for-anon_vma-prior-to-folio-allocation.pc b/pc/hugetlb-check-for-anon_vma-prior-to-folio-allocation.pc new file mode 100644 index 000000000..6dc98425d --- /dev/null +++ b/pc/hugetlb-check-for-anon_vma-prior-to-folio-allocation.pc @@ -0,0 +1 @@ +mm/hugetlb.c diff --git a/pc/init-fix-allocated-page-overlapping-with-ptr_err.pc b/pc/init-fix-allocated-page-overlapping-with-ptr_err.pc new file mode 100644 index 000000000..1b7210806 --- /dev/null +++ b/pc/init-fix-allocated-page-overlapping-with-ptr_err.pc @@ -0,0 +1 @@ +init/main.c diff --git a/pc/mm-arm64-override-clear_young_dirty_ptes-batch-helper.pc b/pc/mm-arm64-override-clear_young_dirty_ptes-batch-helper.pc new file mode 100644 index 000000000..ba3c28b5d --- /dev/null +++ b/pc/mm-arm64-override-clear_young_dirty_ptes-batch-helper.pc @@ -0,0 +1,2 @@ +arch/arm64/include/asm/pgtable.h +arch/arm64/mm/contpte.c diff --git a/pc/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.pc b/pc/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.pc new file mode 100644 index 000000000..b35bccbe3 --- /dev/null +++ b/pc/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.pc @@ -0,0 +1 @@ +mm/huge_memory.c diff --git a/pc/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.pc b/pc/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.pc new file mode 100644 index 000000000..006118fb2 --- /dev/null +++ b/pc/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.pc @@ -0,0 +1,3 @@ +include/linux/mm_types.h +include/linux/pgtable.h +mm/madvise.c diff --git a/pc/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.pc b/pc/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.pc new file mode 100644 index 000000000..74d58a564 --- /dev/null +++ b/pc/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.pc @@ -0,0 +1 @@ +mm/madvise.c diff --git a/pc/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.pc b/pc/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.pc new file mode 100644 index 000000000..8491d45ba --- /dev/null +++ b/pc/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.pc @@ -0,0 +1,3 @@ +mm/internal.h +mm/madvise.c +mm/memory.c diff --git a/pc/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.pc b/pc/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.pc new file mode 100644 index 000000000..b6b7df785 --- /dev/null +++ b/pc/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.pc @@ -0,0 +1 @@ +mm/swapfile.c diff --git a/pc/null-pointer-dereference-while-shrinking-zswap.pc b/pc/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.pc index 7f1f05d5c..7f1f05d5c 100644 --- a/pc/null-pointer-dereference-while-shrinking-zswap.pc +++ b/pc/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.pc diff --git a/pc/stackdepot-respect-__gfp_nolockdep-allocation-flag.pc b/pc/stackdepot-respect-__gfp_nolockdep-allocation-flag.pc new file mode 100644 index 000000000..0e968ad16 --- /dev/null +++ b/pc/stackdepot-respect-__gfp_nolockdep-allocation-flag.pc @@ -0,0 +1 @@ +lib/stackdepot.c diff --git a/txt/crash-add-prefix-for-crash-dumping-messages.txt 
b/txt/crash-add-prefix-for-crash-dumping-messages.txt new file mode 100644 index 000000000..bf92d3291 --- /dev/null +++ b/txt/crash-add-prefix-for-crash-dumping-messages.txt @@ -0,0 +1,17 @@ +From: Baoquan He <bhe@redhat.com> +Subject: crash: add prefix for crash dumping messages +Date: Thu, 18 Apr 2024 11:58:43 +0800 + +Add pr_fmt() to kernel/crash_core.c so that the module name is printed +as a prefix on its debugging messages. + +Also add the 'crashkernel:' prefix to two message-printing lines in +kernel/crash_reserve.c. In that file, almost all debugging messages +already have the 'crashkernel:' prefix, or contain the keyword +crashkernel at the beginning or in the middle, so adding pr_fmt() there +would be redundant. + +Link: https://lkml.kernel.org/r/20240418035843.1562887-1-bhe@redhat.com +Signed-off-by: Baoquan He <bhe@redhat.com> +Cc: Dave Young <dyoung@redhat.com> +Cc: Jiri Slaby <jirislaby@kernel.org> diff --git a/txt/hugetlb-check-for-anon_vma-prior-to-folio-allocation.txt b/txt/hugetlb-check-for-anon_vma-prior-to-folio-allocation.txt new file mode 100644 index 000000000..a21998914 --- /dev/null +++ b/txt/hugetlb-check-for-anon_vma-prior-to-folio-allocation.txt @@ -0,0 +1,19 @@ +From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com> +Subject: hugetlb: check for anon_vma prior to folio allocation +Date: Mon, 15 Apr 2024 14:17:47 -0700 + +Commit 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of +anon_vma_prepare()") may bail out after allocating a folio if we do not +hold the mmap lock. When this occurs, vmf_anon_prepare() will release the +vma lock. Hugetlb then attempts to call restore_reserve_on_error(), which +depends on the vma lock being held. + +We can move vmf_anon_prepare() prior to the folio allocation in order to +avoid calling restore_reserve_on_error() without the vma lock. + +Link: https://lkml.kernel.org/r/ZiFqSrSRLhIV91og@fedora +Fixes: 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()") +Reported-by: syzbot+ad1b592fc4483655438b@syzkaller.appspotmail.com +Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com> +Cc: Muchun Song <muchun.song@linux.dev> +Cc: <stable@vger.kernel.org> diff --git a/txt/init-fix-allocated-page-overlapping-with-ptr_err.txt b/txt/init-fix-allocated-page-overlapping-with-ptr_err.txt new file mode 100644 index 000000000..5d68e1b92 --- /dev/null +++ b/txt/init-fix-allocated-page-overlapping-with-ptr_err.txt @@ -0,0 +1,49 @@ +From: Nam Cao <namcao@linutronix.de> +Subject: init: fix allocated page overlapping with PTR_ERR +Date: Thu, 18 Apr 2024 12:29:43 +0200 + +There is nothing preventing kernel memory allocators from allocating a +page that overlaps with PTR_ERR(), except for architecture-specific code +that sets up memblock. + +It was discovered that the RISCV architecture doesn't set up memblock +correctly, leading to a page overlapping with PTR_ERR() being allocated, +and subsequently crashing the kernel (link in the Closes: tag below). + +The reported crash has nothing to do with PTR_ERR(): the last page (at +address 0xfffff000) being allocated leads to an unexpected arithmetic +overflow in ext4; but still, this page shouldn't be allocated in the first +place. + +Because PTR_ERR() is an architecture-independent thing, we shouldn't ask +every single architecture to set this up. There may be other +architectures besides RISCV that have the same problem. + +Fix this once and for all by reserving the physical memory page that may +be mapped to the last virtual memory page as part of low memory. 
diff --git a/txt/hugetlb-check-for-anon_vma-prior-to-folio-allocation.txt b/txt/hugetlb-check-for-anon_vma-prior-to-folio-allocation.txt
new file mode 100644
index 000000000..a21998914
--- /dev/null
+++ b/txt/hugetlb-check-for-anon_vma-prior-to-folio-allocation.txt
@@ -0,0 +1,19 @@
+From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com>
+Subject: hugetlb: check for anon_vma prior to folio allocation
+Date: Mon, 15 Apr 2024 14:17:47 -0700
+
+Commit 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of
+anon_vma_prepare()") may bail out after allocating a folio if we do not
+hold the mmap lock. When this occurs, vmf_anon_prepare() will release the
+vma lock. Hugetlb then attempts to call restore_reserve_on_error(), which
+depends on the vma lock being held.
+
+We can move vmf_anon_prepare() prior to the folio allocation in order to
+avoid calling restore_reserve_on_error() without the vma lock.
+
+Link: https://lkml.kernel.org/r/ZiFqSrSRLhIV91og@fedora
+Fixes: 9acad7ba3e25 ("hugetlb: use vmf_anon_prepare() instead of anon_vma_prepare()")
+Reported-by: syzbot+ad1b592fc4483655438b@syzkaller.appspotmail.com
+Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: <stable@vger.kernel.org>
diff --git a/txt/init-fix-allocated-page-overlapping-with-ptr_err.txt b/txt/init-fix-allocated-page-overlapping-with-ptr_err.txt
new file mode 100644
index 000000000..5d68e1b92
--- /dev/null
+++ b/txt/init-fix-allocated-page-overlapping-with-ptr_err.txt
@@ -0,0 +1,49 @@
+From: Nam Cao <namcao@linutronix.de>
+Subject: init: fix allocated page overlapping with PTR_ERR
+Date: Thu, 18 Apr 2024 12:29:43 +0200
+
+There is nothing preventing kernel memory allocators from allocating a
+page that overlaps with PTR_ERR(), except for architecture-specific code
+that sets up memblock.
+
+It was discovered that the RISC-V architecture doesn't set up memblock
+correctly, leading to a page overlapping with PTR_ERR() being allocated,
+and subsequently crashing the kernel (see the Closes: link below).
+
+The reported crash has nothing to do with PTR_ERR(): the last page (at
+address 0xfffff000) being allocated leads to an unexpected arithmetic
+overflow in ext4; but still, this page shouldn't be allocated in the
+first place.
+
+Because PTR_ERR() is an architecture-independent thing, we shouldn't ask
+every single architecture to set this up. There may be other
+architectures besides RISC-V that have the same problem.
+
+Fix this once and for all by reserving the physical memory page that may
+be mapped to the last virtual memory page as part of low memory.
+
+Unfortunately, this means that if there is actual memory at this reserved
+location, that memory will become inaccessible. However, if this page is
+not reserved, it can only be accessed as high memory, so this doesn't
+matter if high memory is not supported. Even if high memory is
+supported, it is still only one page.
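To make the overlap concrete: IS_ERR() treats the top MAX_ERRNO (4095)
values of the address space as encoded error numbers, so almost any
pointer into the last virtual page passes IS_ERR(). A hedged sketch of
the aliasing (illustrative only; the constants follow the usual
include/linux/err.h definitions):

    #include <linux/err.h>
    #include <linux/printk.h>

    /* A pointer that lands inside the last virtual page ... */
    static void demo_ptr_err_alias(void)
    {
            void *p = (void *)(-MAX_ERRNO + 0x20UL);

            /* ... satisfies IS_ERR() although it was never an error. */
            if (IS_ERR(p))
                    pr_warn("valid pointer misread as errno %ld\n",
                            PTR_ERR(p));
    }

Reserving that page in memblock means no allocator can ever hand out
memory whose virtual address falls in this ambiguous range.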
+
+Closes: https://lore.kernel.org/linux-riscv/878r1ibpdn.fsf@all.your.base.are.belong.to.us
+Link: https://lkml.kernel.org/r/20240418102943.180510-1-namcao@linutronix.de
+Signed-off-by: Nam Cao <namcao@linutronix.de>
+Reported-by: Björn Töpel <bjorn@kernel.org>
+Tested-by: Björn Töpel <bjorn@kernel.org>
+Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
+Cc: Andreas Dilger <adilger@dilger.ca>
+Cc: Arnd Bergmann <arnd@arndb.de>
+Cc: Changbin Du <changbin.du@huawei.com>
+Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
+Cc: Geert Uytterhoeven <geert+renesas@glider.be>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Krister Johansen <kjlx@templeofstupid.com>
+Cc: Luis Chamberlain <mcgrof@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: <stable@vger.kernel.org>
diff --git a/txt/mm-arm64-override-clear_young_dirty_ptes-batch-helper.txt b/txt/mm-arm64-override-clear_young_dirty_ptes-batch-helper.txt
new file mode 100644
index 000000000..43fe8f8d4
--- /dev/null
+++ b/txt/mm-arm64-override-clear_young_dirty_ptes-batch-helper.txt
@@ -0,0 +1,23 @@
+From: Lance Yang <ioworker0@gmail.com>
+Subject: mm/arm64: override clear_young_dirty_ptes() batch helper
+Date: Thu, 18 Apr 2024 21:44:33 +0800
+
+The per-pte get_and_clear/modify/set approach would result in
+unfolding/refolding for contpte mappings on arm64. So we need to
+override clear_young_dirty_ptes() for arm64 to avoid it.
+
+Link: https://lkml.kernel.org/r/20240418134435.6092-3-ioworker0@gmail.com
+Signed-off-by: Lance Yang <ioworker0@gmail.com>
+Suggested-by: Barry Song <21cnbao@gmail.com>
+Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
+Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Jeff Xie <xiehuan09@gmail.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yin Fengwei <fengwei.yin@intel.com>
+Cc: Zach O'Keefe <zokeefe@google.com>
diff --git a/txt/mm-create-folio_flag_false-and-folio_type_ops-macros.txt b/txt/mm-create-folio_flag_false-and-folio_type_ops-macros.txt
index acacd697a..46a9f7b37 100644
--- a/txt/mm-create-folio_flag_false-and-folio_type_ops-macros.txt
+++ b/txt/mm-create-folio_flag_false-and-folio_type_ops-macros.txt
@@ -7,9 +7,11 @@ FOLIO_FLAG_FALSE from PAGEFLAG_FALSE and FOLIO_TYPE_OPS from
 PAGE_TYPE_OPS.
 
 Link: https://lkml.kernel.org/r/20240321142448.1645400-3-willy@infradead.org
+Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")
 Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Acked-by: Vlastimil Babka <vbabka@suse.cz>
 Cc: Miaohe Lin <linmiaohe@huawei.com>
 Cc: Muchun Song <muchun.song@linux.dev>
 Cc: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
diff --git a/txt/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.txt b/txt/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.txt
new file mode 100644
index 000000000..528263ad1
--- /dev/null
+++ b/txt/mm-huge_memory-improve-split_huge_page_to_list_to_order-return-value-documentation.txt
@@ -0,0 +1,15 @@
+From: David Hildenbrand <david@redhat.com>
+Subject: mm/huge_memory: improve split_huge_page_to_list_to_order() return value documentation
+Date: Thu, 18 Apr 2024 17:18:34 +0200
+
+The documentation is wrong and relying on it almost resulted in BUGs in
+new callers: we return -EAGAIN on unexpected folio references, not -EBUSY.
+
+Let's fix that and also document which other return values we can
+currently see and why they could happen.
+
+Link: https://lkml.kernel.org/r/20240418151834.216557-1-david@redhat.com
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: Matthew Wilcox <willy@infradead.org>
diff --git a/txt/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.txt b/txt/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.txt
new file mode 100644
index 000000000..9d5731b50
--- /dev/null
+++ b/txt/mm-madvise-introduce-clear_young_dirty_ptes-batch-helper.txt
@@ -0,0 +1,61 @@
+From: Lance Yang <ioworker0@gmail.com>
+Subject: mm/madvise: introduce clear_young_dirty_ptes() batch helper
+Date: Thu, 18 Apr 2024 21:44:32 +0800
+
+Patch series "mm/madvise: enhance lazyfreeing with mTHP in madvise_free",
+v10.
+
+This patchset adds support for lazyfreeing multi-size THP (mTHP) without
+needing to first split the large folio via split_folio(). However, we
+still need to split a large folio that is not fully mapped within the
+target range.
+
+If a large folio is locked or shared, or if we fail to split it, we just
+leave it in place and advance to the next PTE in the range. But note that
+the behavior is changed; previously, any failure of this sort would cause
+the entire operation to give up. As large folios become more common,
+sticking to the old way could result in wasted opportunities.
+
+Performance Testing
+===================
+
+On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
+the same size results in the following runtimes for madvise(MADV_FREE) in
+seconds (shorter is better):
+
+Folio Size |   Old    |   New    | Change
+------------------------------------------
+      4KiB | 0.590251 | 0.590259 |    0%
+     16KiB | 2.990447 | 0.185655 |  -94%
+     32KiB | 2.547831 | 0.104870 |  -95%
+     64KiB | 2.457796 | 0.052812 |  -97%
+    128KiB | 2.281034 | 0.032777 |  -99%
+    256KiB | 2.230387 | 0.017496 |  -99%
+    512KiB | 2.189106 | 0.010781 |  -99%
+   1024KiB | 2.183949 | 0.007753 |  -99%
+   2048KiB | 0.002799 | 0.002804 |    0%
+
+
+This patch (of 4):
+
+This commit introduces clear_young_dirty_ptes() to replace mkold_ptes().
+By doing so, we can use the same function for both use cases
+(madvise_pageout and madvise_free), and it also provides the flexibility
+to only clear the dirty flag in the future if needed.
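As an aside for readers following the series, the generic fallback for
such a batch helper plausibly looks like the per-PTE loop below (a sketch
based on this series' description; the CYDP_* flag names and the exact
signature are taken on trust from the patches and may differ in detail):

    /* Generic fallback: apply young/dirty clearing to nr consecutive
     * PTEs, one at a time. Architectures (e.g. arm64 contpte) can
     * override this with a real batched implementation. */
    static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
                    unsigned long addr, pte_t *ptep, unsigned int nr,
                    cydp_t flags)
    {
            pte_t pte;

            for (;;) {
                    if (flags == CYDP_CLEAR_YOUNG) {
                            ptep_test_and_clear_young(vma, addr, ptep);
                    } else {
                            pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);
                            if (flags & CYDP_CLEAR_YOUNG)
                                    pte = pte_mkold(pte);
                            if (flags & CYDP_CLEAR_DIRTY)
                                    pte = pte_mkclean(pte);
                            set_pte_at(vma->vm_mm, addr, ptep, pte);
                    }
                    if (--nr == 0)
                            break;
                    ptep++;
                    addr += PAGE_SIZE;
            }
    }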
+
+Link: https://lkml.kernel.org/r/20240418134435.6092-1-ioworker0@gmail.com
+Link: https://lkml.kernel.org/r/20240418134435.6092-2-ioworker0@gmail.com
+Signed-off-by: Lance Yang <ioworker0@gmail.com>
+Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Jeff Xie <xiehuan09@gmail.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yin Fengwei <fengwei.yin@intel.com>
+Cc: Zach O'Keefe <zokeefe@google.com>
diff --git a/txt/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.txt b/txt/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.txt
new file mode 100644
index 000000000..51f30897c
--- /dev/null
+++ b/txt/mm-madvise-optimize-lazyfreeing-with-mthp-in-madvise_free.txt
@@ -0,0 +1,47 @@
+From: Lance Yang <ioworker0@gmail.com>
+Subject: mm/madvise: optimize lazyfreeing with mTHP in madvise_free
+Date: Thu, 18 Apr 2024 21:44:35 +0800
+
+This patch optimizes lazyfreeing with PTE-mapped mTHP[1] (inspired by
+David Hildenbrand[2]). We aim to avoid unnecessary folio splitting if the
+large folio is fully mapped within the target range.
+
+If a large folio is locked or shared, or if we fail to split it, we just
+leave it in place and advance to the next PTE in the range. But note that
+the behavior is changed; previously, any failure of this sort would cause
+the entire operation to give up. As large folios become more common,
+sticking to the old way could result in wasted opportunities.
+
+On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
+the same size results in the following runtimes for madvise(MADV_FREE) in
+seconds (shorter is better):
+
+Folio Size |   Old    |   New    | Change
+------------------------------------------
+      4KiB | 0.590251 | 0.590259 |    0%
+     16KiB | 2.990447 | 0.185655 |  -94%
+     32KiB | 2.547831 | 0.104870 |  -95%
+     64KiB | 2.457796 | 0.052812 |  -97%
+    128KiB | 2.281034 | 0.032777 |  -99%
+    256KiB | 2.230387 | 0.017496 |  -99%
+    512KiB | 2.189106 | 0.010781 |  -99%
+   1024KiB | 2.183949 | 0.007753 |  -99%
+   2048KiB | 0.002799 | 0.002804 |    0%
+
+[1] https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com
+[2] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com
+
+Link: https://lkml.kernel.org/r/20240418134435.6092-5-ioworker0@gmail.com
+Signed-off-by: Lance Yang <ioworker0@gmail.com>
+Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Jeff Xie <xiehuan09@gmail.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yin Fengwei <fengwei.yin@intel.com>
+Cc: Zach O'Keefe <zokeefe@google.com>
diff --git a/txt/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.txt b/txt/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.txt
new file mode 100644
index 000000000..5fdfd3742
--- /dev/null
+++ b/txt/mm-memory-add-any_dirty-optional-pointer-to-folio_pte_batch.txt
@@ -0,0 +1,24 @@
+From: Lance Yang <ioworker0@gmail.com>
+Subject: mm/memory: add any_dirty optional pointer to folio_pte_batch()
+Date: Thu, 18 Apr 2024 21:44:34 +0800
+
+This commit adds the any_dirty pointer as an optional parameter to the
+folio_pte_batch() function. By using both the any_young and any_dirty
+pointers, madvise_free can make smarter decisions about whether to clear
+the PTEs when marking large folios as lazyfree.
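A hedged usage sketch of the extended helper (the signature follows this
series' description; names like FPB_IGNORE_DIRTY are taken on trust from
the mm batching code, and the wrapper below is hypothetical, not a
literal hunk):

    /* Batch over the PTEs mapping a large folio and learn in one pass
     * whether any entry in the batch was young or dirty; callers like
     * madvise_free can then decide whether the folio may be lazyfreed
     * without touching the PTEs. */
    static int probe_batch(struct folio *folio, unsigned long addr,
                           pte_t *pte, pte_t ptent, int max_nr,
                           bool *young, bool *dirty)
    {
            *young = *dirty = false;
            return folio_pte_batch(folio, addr, pte, ptent, max_nr,
                                   FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY,
                                   NULL, young, dirty);
    }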
+
+Link: https://lkml.kernel.org/r/20240418134435.6092-4-ioworker0@gmail.com
+Signed-off-by: Lance Yang <ioworker0@gmail.com>
+Suggested-by: David Hildenbrand <david@redhat.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Barry Song <21cnbao@gmail.com>
+Cc: Jeff Xie <xiehuan09@gmail.com>
+Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: Ryan Roberts <ryan.roberts@arm.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Yin Fengwei <fengwei.yin@intel.com>
+Cc: Zach O'Keefe <zokeefe@google.com>
diff --git a/txt/mm-page_table_check-support-userfault-wr-protect-entries.txt b/txt/mm-page_table_check-support-userfault-wr-protect-entries.txt
index 046beb6cb..180b38c6a 100644
--- a/txt/mm-page_table_check-support-userfault-wr-protect-entries.txt
+++ b/txt/mm-page_table_check-support-userfault-wr-protect-entries.txt
@@ -48,7 +48,7 @@ better now.
 
 Link: https://lkml.kernel.org/r/20240417212549.2766883-1-peterx@redhat.com
 Signed-off-by: Peter Xu <peterx@redhat.com>
-Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
+Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
 Cc: Axel Rasmussen <axelrasmussen@google.com>
 Cc: David Hildenbrand <david@redhat.com>
 Cc: Nadav Amit <nadav.amit@gmail.com>
diff --git a/txt/mm-support-page_mapcount-on-page_has_type-pages.txt b/txt/mm-support-page_mapcount-on-page_has_type-pages.txt
index f226b6859..beafd94ce 100644
--- a/txt/mm-support-page_mapcount-on-page_has_type-pages.txt
+++ b/txt/mm-support-page_mapcount-on-page_has_type-pages.txt
@@ -7,9 +7,11 @@ works. It is more convenient for users to not have to filter out these
 pages.
 
 Link: https://lkml.kernel.org/r/20240321142448.1645400-5-willy@infradead.org
+Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")
 Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
 Reviewed-by: David Hildenbrand <david@redhat.com>
 Acked-by: Vlastimil Babka <vbabka@suse.cz>
 Cc: Miaohe Lin <linmiaohe@huawei.com>
 Cc: Muchun Song <muchun.song@linux.dev>
 Cc: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
diff --git a/txt/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.txt b/txt/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.txt
new file mode 100644
index 000000000..65cc6c631
--- /dev/null
+++ b/txt/mm-swapfile-check-usable-swap-device-in-__folio_throttle_swaprate.txt
@@ -0,0 +1,18 @@
+From: Kefeng Wang <wangkefeng.wang@huawei.com>
+Subject: mm: swapfile: check usable swap device in __folio_throttle_swaprate()
+Date: Thu, 18 Apr 2024 21:56:44 +0800
+
+Skip blk_cgroup_congested() if there is no usable swap device, since no
+swapin/out will occur, thereby avoiding taking swap_lock. The difference
+is shown below in perf data from a CoW pagefault:
+
+ perf report -g -i perf.data.swapoff | egrep "blk_cgroup_congested|__folio_throttle_swaprate"
+ 1.01% 0.16% page_fault2_pro [kernel.kallsyms] [k] __folio_throttle_swaprate
+ 0.83% 0.80% page_fault2_pro [kernel.kallsyms] [k] blk_cgroup_congested
+
+ perf report -g -i perf.data.swapon | egrep "blk_cgroup_congested|__folio_throttle_swaprate"
+ 0.15% 0.15% page_fault2_pro [kernel.kallsyms] [k] __folio_throttle_swaprate
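A sketch of the approach (hedged: the usable-swap predicate below is a
stand-in for illustration, not necessarily the exact check the patch
uses):

    /* Bail out before blk_cgroup_congested() when no swap device is
     * usable, so the swap lock is never taken on this path. */
    void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
    {
            if (!(gfp & __GFP_IO))
                    return;

            /* No usable swap device means no swapin/out can occur. */
            if (!get_nr_swap_pages())
                    return;

            if (!blk_cgroup_congested())
                    return;

            /* ... throttle against the swap disk, as before ... */
    }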
+
+Link: https://lkml.kernel.org/r/20240418135644.2736748-1-wangkefeng.wang@huawei.com
+Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Tejun Heo <tj@kernel.org>
diff --git a/txt/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.txt b/txt/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.txt
new file mode 100644
index 000000000..09615713a
--- /dev/null
+++ b/txt/mm-zswap-fix-shrinker-null-crash-with-cgroup_disable=memory.txt
@@ -0,0 +1,35 @@
+From: Johannes Weiner <hannes@cmpxchg.org>
+Subject: mm: zswap: fix shrinker NULL crash with cgroup_disable=memory
+Date: Thu, 18 Apr 2024 08:26:28 -0400
+
+Christian reports a NULL deref in zswap that he bisected down to the zswap
+shrinker. The issue also cropped up in the bug trackers of libguestfs [1]
+and the Red Hat bugzilla [2].
+
+The problem is that when memcg is disabled with the boot time flag, the
+zswap shrinker might get called with sc->memcg == NULL. This is okay in
+many places, like the lruvec operations. But it crashes in
+memcg_page_state() - which is only used due to the non-node accounting of
+the cgroup's zswap memory to begin with.
+
+Nhat spotted that the memcg can be NULL in the memcg-disabled case, and I
+was then able to reproduce the crash locally as well.
+
+[1] https://github.com/libguestfs/libguestfs/issues/139
+[2] https://bugzilla.redhat.com/show_bug.cgi?id=2275252
+
+Link: https://lkml.kernel.org/r/20240418124043.GC1055428@cmpxchg.org
+Link: https://lkml.kernel.org/r/20240417143324.GA1055428@cmpxchg.org
+Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure")
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Christian Heusel <christian@heusel.eu>
+Debugged-by: Nhat Pham <nphamcs@gmail.com>
+Suggested-by: Nhat Pham <nphamcs@gmail.com>
+Tested-by: Christian Heusel <christian@heusel.eu>
+Acked-by: Yosry Ahmed <yosryahmed@google.com>
+Cc: Chengming Zhou <chengming.zhou@linux.dev>
+Cc: Dan Streetman <ddstreet@ieee.org>
+Cc: Richard W.M. Jones <rjones@redhat.com>
+Cc: Seth Jennings <sjenning@redhat.com>
+Cc: Vitaly Wool <vitaly.wool@konsulko.com>
+Cc: <stable@vger.kernel.org> [v6.8]
diff --git a/txt/null-pointer-dereference-while-shrinking-zswap.txt b/txt/old/null-pointer-dereference-while-shrinking-zswap.txt
index f437585b2..f437585b2 100644
--- a/txt/null-pointer-dereference-while-shrinking-zswap.txt
+++ b/txt/old/null-pointer-dereference-while-shrinking-zswap.txt
diff --git a/txt/stackdepot-respect-__gfp_nolockdep-allocation-flag.txt b/txt/stackdepot-respect-__gfp_nolockdep-allocation-flag.txt
new file mode 100644
index 000000000..824f42ed9
--- /dev/null
+++ b/txt/stackdepot-respect-__gfp_nolockdep-allocation-flag.txt
@@ -0,0 +1,68 @@
+From: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Subject: stackdepot: respect __GFP_NOLOCKDEP allocation flag
+Date: Thu, 18 Apr 2024 16:11:33 +0200
+
+If stack_depot_save_flags() allocates memory, it always drops the
+__GFP_NOLOCKDEP flag. So when KASAN tries to track a __GFP_NOLOCKDEP
+allocation, we may end up with a lockdep splat like the one below:
+
+======================================================
+ WARNING: possible circular locking dependency detected
+ 6.9.0-rc3+ #49 Not tainted
+ ------------------------------------------------------
+ kswapd0/149 is trying to acquire lock:
+ ffff88811346a920 (&xfs_nondir_ilock_class){++++}-{4:4}, at: xfs_reclaim_inode+0x3ac/0x590 [xfs]
+
+ but task is already holding lock:
+ ffffffff8bb33100 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x5d9/0xad0
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+ -> #1 (fs_reclaim){+.+.}-{0:0}:
+        __lock_acquire+0x7da/0x1030
+        lock_acquire+0x15d/0x400
+        fs_reclaim_acquire+0xb5/0x100
+        prepare_alloc_pages.constprop.0+0xc5/0x230
+        __alloc_pages+0x12a/0x3f0
+        alloc_pages_mpol+0x175/0x340
+        stack_depot_save_flags+0x4c5/0x510
+        kasan_save_stack+0x30/0x40
+        kasan_save_track+0x10/0x30
+        __kasan_slab_alloc+0x83/0x90
+        kmem_cache_alloc+0x15e/0x4a0
+        __alloc_object+0x35/0x370
+        __create_object+0x22/0x90
+        __kmalloc_node_track_caller+0x477/0x5b0
+        krealloc+0x5f/0x110
+        xfs_iext_insert_raw+0x4b2/0x6e0 [xfs]
+        xfs_iext_insert+0x2e/0x130 [xfs]
+        xfs_iread_bmbt_block+0x1a9/0x4d0 [xfs]
+        xfs_btree_visit_block+0xfb/0x290 [xfs]
+        xfs_btree_visit_blocks+0x215/0x2c0 [xfs]
+        xfs_iread_extents+0x1a2/0x2e0 [xfs]
+        xfs_buffered_write_iomap_begin+0x376/0x10a0 [xfs]
+        iomap_iter+0x1d1/0x2d0
+        iomap_file_buffered_write+0x120/0x1a0
+        xfs_file_buffered_write+0x128/0x4b0 [xfs]
+        vfs_write+0x675/0x890
+        ksys_write+0xc3/0x160
+        do_syscall_64+0x94/0x170
+        entry_SYSCALL_64_after_hwframe+0x71/0x79
+
+Always preserve __GFP_NOLOCKDEP to fix this.
+
+Link: https://lkml.kernel.org/r/20240418141133.22950-1-ryabinin.a.a@gmail.com
+Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB")
+Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Reported-by: Xiubo Li <xiubli@redhat.com>
+Closes: https://lore.kernel.org/all/a0caa289-ca02-48eb-9bf2-d86fd47b71f4@redhat.com/
+Reported-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Closes: https://lore.kernel.org/all/f9ff999a-e170-b66b-7caf-293f2b147ac2@opensource.wdc.com/
+Suggested-by: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: <stable@vger.kernel.org>
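A minimal sketch of the fix's shape (hedged: the helper below is
hypothetical - in lib/stackdepot.c the masking is done inline in
stack_depot_save_flags() - and the exact mask may differ in mainline):

    /* Sanitize the caller's gfp mask for the depot's own page
     * allocation, but keep __GFP_NOLOCKDEP so lockdep does not track
     * the nested allocation. */
    static gfp_t depot_alloc_gfp(gfp_t alloc_flags)
    {
            alloc_flags &= ~GFP_ZONEMASK;
            alloc_flags &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP);
            alloc_flags |= __GFP_NOWARN;
            return alloc_flags;
    }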