From 631426ba1d45a8672b177ee85ad4cabe760dd131 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 14 Mar 2024 17:12:59 +0100 Subject: mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly Darrick reports that in some cases where pread() would fail with -EIO and mmap()+access would generate a SIGBUS signal, MADV_POPULATE_READ / MADV_POPULATE_WRITE will keep retrying forever and not fail with -EFAULT. While the madvise() call can be interrupted by a signal, this is not the desired behavior. MADV_POPULATE_READ / MADV_POPULATE_WRITE should behave like page faults in that case: fail and not retry forever. A reproducer can be found at [1]. The reason is that __get_user_pages(), as called by faultin_vma_page_range(), will not handle VM_FAULT_RETRY in a proper way: it will simply return 0 when VM_FAULT_RETRY happened, making madvise_populate()->faultin_vma_page_range() retry again and again, never setting FOLL_TRIED->FAULT_FLAG_TRIED for __get_user_pages(). __get_user_pages_locked() does what we want, but duplicating that logic in faultin_vma_page_range() feels wrong. So let's use __get_user_pages_locked() instead, that will detect VM_FAULT_RETRY and set FOLL_TRIED when retrying, making the fault handler return VM_FAULT_SIGBUS (VM_FAULT_ERROR) at some point, propagating -EFAULT from faultin_page() to __get_user_pages(), all the way to madvise_populate(). But, there is an issue: __get_user_pages_locked() will end up re-taking the MM lock and then __get_user_pages() will do another VMA lookup. In the meantime, the VMA layout could have changed and we'd fail with different error codes than we'd want to. As __get_user_pages() will currently do a new VMA lookup either way, let it do the VMA handling in a different way, controlled by a new FOLL_MADV_POPULATE flag, effectively moving these checks from madvise_populate() + faultin_page_range() in there. With this change, Darricks reproducer properly fails with -EFAULT, as documented for MADV_POPULATE_READ / MADV_POPULATE_WRITE. [1] https://lore.kernel.org/all/20240313171936.GN1927156@frogsfrogsfrogs/ Link: https://lkml.kernel.org/r/20240314161300.382526-1-david@redhat.com Link: https://lkml.kernel.org/r/20240314161300.382526-2-david@redhat.com Fixes: 4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables") Signed-off-by: David Hildenbrand Reported-by: Darrick J. Wong Closes: https://lore.kernel.org/all/20240311223815.GW1927156@frogsfrogsfrogs/ Cc: Darrick J. Wong Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 54 ++++++++++++++++++++++++++++++++---------------------- mm/internal.h | 10 ++++++---- mm/madvise.c | 17 ++--------------- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index af8edadc05d1b8..1611e73b1121b1 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm, /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { + /* + * MADV_POPULATE_(READ|WRITE) wants to handle VMA + * lookups+error reporting differently. + */ + if (gup_flags & FOLL_MADV_POPULATE) { + vma = vma_lookup(mm, start); + if (!vma) { + ret = -ENOMEM; + goto out; + } + if (check_vma_flags(vma, gup_flags)) { + ret = -EINVAL; + goto out; + } + goto retry; + } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, @@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma, } /* - * faultin_vma_page_range() - populate (prefault) page tables inside the - * given VMA range readable/writable + * faultin_page_range() - populate (prefault) page tables inside the + * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * - * @vma: target vma + * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * - * Returns either number of processed pages in the vma, or a negative error - * code on error (see __get_user_pages()). + * Returns either number of processed pages in the MM, or a negative error + * code on error (see __get_user_pages()). Note that this function reports + * errors related to VMAs, such as incompatible mappings, as expected by + * MADV_POPULATE_(READ|WRITE). * - * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and - * covered by the VMA. If it's released, *@locked will be set to 0. + * The range must be page-aligned. + * + * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ -long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, bool write, int *locked) +long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked) { - struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | + FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; - /* - * We want to report -EINVAL instead of -EFAULT for any permission - * problems or incompatible mappings. - */ - if (check_vma_flags(vma, gup_flags)) - return -EINVAL; - - ret = __get_user_pages(mm, start, nr_pages, gup_flags, - NULL, locked); + ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, + gup_flags); lru_add_drain(); return ret; } diff --git a/mm/internal.h b/mm/internal.h index 7e486f2c502cee..07ad2675a88b47 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio); void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); -extern long faultin_vma_page_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool write, int *locked); +extern long faultin_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); @@ -1127,10 +1126,13 @@ enum { FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, + /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ + FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ - FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ + FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, diff --git a/mm/madvise.c b/mm/madvise.c index 44a498c94158c8..1a073fcc4c0c02 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma, { const bool write = behavior == MADV_POPULATE_WRITE; struct mm_struct *mm = vma->vm_mm; - unsigned long tmp_end; int locked = 1; long pages; *prev = vma; while (start < end) { - /* - * We might have temporarily dropped the lock. For example, - * our VMA might have been split. - */ - if (!vma || start >= vma->vm_end) { - vma = vma_lookup(mm, start); - if (!vma) - return -ENOMEM; - } - - tmp_end = min_t(unsigned long, end, vma->vm_end); /* Populate (prefault) page tables readable/writable. */ - pages = faultin_vma_page_range(vma, start, tmp_end, write, - &locked); + pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; @@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma, pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; - case -ENOMEM: + case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } -- cgit 1.2.3-korg From c0205eaf3af9f5db14d4b5ee4abacf4a583c3c50 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 4 Apr 2024 10:17:26 -0700 Subject: userfaultfd: change src_folio after ensuring it's unpinned in UFFDIO_MOVE Commit d7a08838ab74 ("mm: userfaultfd: fix unexpected change to src_folio when UFFDIO_MOVE fails") moved the src_folio->{mapping, index} changing to after clearing the page-table and ensuring that it's not pinned. This avoids failure of swapout+migration and possibly memory corruption. However, the commit missed fixing it in the huge-page case. Link: https://lkml.kernel.org/r/20240404171726.2302435-1-lokeshgidra@google.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Lokesh Gidra Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Kalesh Singh Cc: Lokesh Gidra Cc: Nicolas Geoffray Cc: Peter Xu Cc: Qi Zheng Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9859aa4f755380..89f58c7603b255 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { @@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm goto unlock_ptls; } + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma); -- cgit 1.2.3-korg From ea4b5b33bf8a4936cfb07759d926db697c43fb1e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:06:59 +0200 Subject: mm,page_owner: update metadata for tail pages Patch series "page_owner: Fix refcount imbalance and print fixup", v4. This series consists of a refactoring/correctness of updating the metadata of tail pages, a couple of fixups for the refcounting part and a fixup for the stack_start() function. From this series on, instead of counting the stacks, we count the outstanding nr_base_pages each stack has, which gives us a much better memory overview. The other fixup is for the migration part. A more detailed explanation can be found in the changelog of the respective patches. This patch (of 4): __set_page_owner_handle() and __reset_page_owner() update the metadata of all pages when the page is of a higher-order, but we miss to do the same when the pages are migrated. __folio_copy_owner() only updates the metadata of the head page, meaning that the information stored in the first page and the tail pages will not match. Strictly speaking that is not a big problem because 1) we do not print tail pages and 2) upon splitting all tail pages will inherit the metadata of the head page, but it is better to have all metadata in check should there be any problem, so it can ease debugging. For that purpose, a couple of helpers are created __update_page_owner_handle() which updates the metadata on allocation, and __update_page_owner_free_handle() which does the same when the page is freed. __folio_copy_owner() will make use of both as it needs to entirely replace the page_owner metadata for the new page. Link: https://lkml.kernel.org/r/20240404070702.2744-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20240404070702.2744-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Tested-by: Kefeng Wang Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Oscar Salvador Cc: Palmer Dabbelt Cc: Alexandre Ghiti Signed-off-by: Andrew Morton --- mm/page_owner.c | 137 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 63 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index d17d1351ec84af..52d1ced0b57fb8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -228,9 +228,58 @@ static void dec_stack_record_count(depot_stack_handle_t handle) refcount_dec(&stack_record->count); } -void __reset_page_owner(struct page *page, unsigned short order) +static inline void __update_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + gfp_t gfp_mask, + short last_migrate_reason, u64 ts_nsec, + pid_t pid, pid_t tgid, char *comm) { int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = last_migrate_reason; + page_owner->pid = pid; + page_owner->tgid = tgid; + page_owner->ts_nsec = ts_nsec; + strscpy(page_owner->comm, comm, + sizeof(page_owner->comm)); + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_ext = page_ext_next(page_ext); + } +} + +static inline void __update_page_owner_free_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, + pid_t pid, pid_t tgid, + u64 free_ts_nsec) +{ + int i; + struct page_owner *page_owner; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + /* Only __reset_page_owner() wants to clear the bit */ + if (handle) { + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner->free_handle = handle; + } + page_owner->free_ts_nsec = free_ts_nsec; + page_owner->free_pid = current->pid; + page_owner->free_tgid = current->tgid; + page_ext = page_ext_next(page_ext); + } +} + +void __reset_page_owner(struct page *page, unsigned short order) +{ struct page_ext *page_ext; depot_stack_handle_t handle; depot_stack_handle_t alloc_handle; @@ -245,16 +294,10 @@ void __reset_page_owner(struct page *page, unsigned short order) alloc_handle = page_owner->handle; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); - for (i = 0; i < (1 << order); i++) { - __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - page_owner->free_handle = handle; - page_owner->free_ts_nsec = free_ts_nsec; - page_owner->free_pid = current->pid; - page_owner->free_tgid = current->tgid; - page_ext = page_ext_next(page_ext); - page_owner = get_page_owner(page_ext); - } + __update_page_owner_free_handle(page_ext, handle, order, current->pid, + current->tgid, free_ts_nsec); page_ext_put(page_ext); + if (alloc_handle != early_handle) /* * early_handle is being set as a handle for all those @@ -266,36 +309,11 @@ void __reset_page_owner(struct page *page, unsigned short order) dec_stack_record_count(alloc_handle); } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - depot_stack_handle_t handle, - unsigned short order, gfp_t gfp_mask) -{ - struct page_owner *page_owner; - int i; - u64 ts_nsec = local_clock(); - - for (i = 0; i < (1 << order); i++) { - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; - page_owner->pid = current->pid; - page_owner->tgid = current->tgid; - page_owner->ts_nsec = ts_nsec; - strscpy(page_owner->comm, current->comm, - sizeof(page_owner->comm)); - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); - - page_ext = page_ext_next(page_ext); - } -} - noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { struct page_ext *page_ext; + u64 ts_nsec = local_clock(); depot_stack_handle_t handle; handle = save_stack(gfp_mask); @@ -303,7 +321,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order, page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1, + current->pid, current->tgid, ts_nsec, + current->comm); page_ext_put(page_ext); inc_stack_record_count(handle, gfp_mask); } @@ -342,7 +362,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) { struct page_ext *old_ext; struct page_ext *new_ext; - struct page_owner *old_page_owner, *new_page_owner; + struct page_owner *old_page_owner; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -355,31 +375,21 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) } old_page_owner = get_page_owner(old_ext); - new_page_owner = get_page_owner(new_ext); - new_page_owner->order = old_page_owner->order; - new_page_owner->gfp_mask = old_page_owner->gfp_mask; - new_page_owner->last_migrate_reason = - old_page_owner->last_migrate_reason; - new_page_owner->handle = old_page_owner->handle; - new_page_owner->pid = old_page_owner->pid; - new_page_owner->tgid = old_page_owner->tgid; - new_page_owner->free_pid = old_page_owner->free_pid; - new_page_owner->free_tgid = old_page_owner->free_tgid; - new_page_owner->ts_nsec = old_page_owner->ts_nsec; - new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; - strcpy(new_page_owner->comm, old_page_owner->comm); - + __update_page_owner_handle(new_ext, old_page_owner->handle, + old_page_owner->order, old_page_owner->gfp_mask, + old_page_owner->last_migrate_reason, + old_page_owner->ts_nsec, old_page_owner->pid, + old_page_owner->tgid, old_page_owner->comm); /* - * We don't clear the bit on the old folio as it's going to be freed - * after migration. Until then, the info can be useful in case of - * a bug, and the overall stats will be off a bit only temporarily. - * Also, migrate_misplaced_transhuge_page() can still fail the - * migration and then we want the old folio to retain the info. But - * in that case we also don't need to explicitly clear the info from - * the new page, which will be freed. + * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio + * will be freed after migration. Keep them until then as they may be + * useful. */ - __set_bit(PAGE_EXT_OWNER, &new_ext->flags); - __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + __update_page_owner_free_handle(new_ext, 0, old_page_owner->order, + old_page_owner->free_pid, + old_page_owner->free_tgid, + old_page_owner->free_ts_nsec); + page_ext_put(new_ext); page_ext_put(old_ext); } @@ -787,8 +797,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) goto ext_put_continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, - 0, 0); + __update_page_owner_handle(page_ext, early_handle, 0, 0, + -1, local_clock(), current->pid, + current->tgid, current->comm); count++; ext_put_continue: page_ext_put(page_ext); -- cgit 1.2.3-korg From f5c12105c15f0ddf0ff37646290568dd986fa2f3 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:00 +0200 Subject: mm,page_owner: fix refcount imbalance Current code does not contemplate scenarios were an allocation and free operation on the same pages do not handle it in the same amount at once. To give an example, page_alloc_exact(), where we will allocate a page of enough order to stafisfy the size request, but we will free the remainings right away. In the above example, we will increment the stack_record refcount only once, but we will decrease it the same number of times as number of unused pages we have to free. This will lead to a warning because of refcount imbalance. Fix this by recording the number of base pages in the refcount field. Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Tested-by: Alexandre Ghiti Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 73 +++++++++++++++++++++-------------------- mm/page_owner.c | 34 +++++++++++-------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index 0d0334cd51798b..3a45a20fc05a1f 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of each page. It is already implemented and activated if page owner is enabled. Other usages are more than welcome. -It can also be used to show all the stacks and their outstanding -allocations, which gives us a quick overview of where the memory is going -without the need to screen through all the pages and match the allocation -and free operation. +It can also be used to show all the stacks and their current number of +allocated base pages, which gives us a quick overview of where the memory +is going without the need to screen through all the pages and match the +allocation and free operation. page owner is disabled by default. So, if you'd like to use it, you need to add "page_owner=on" to your boot cmdline. If the kernel is built @@ -75,42 +75,45 @@ Usage cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt cat stacks.txt - prep_new_page+0xa9/0x120 - get_page_from_freelist+0x7e6/0x2140 - __alloc_pages+0x18a/0x370 - new_slab+0xc8/0x580 - ___slab_alloc+0x1f2/0xaf0 - __slab_alloc.isra.86+0x22/0x40 - kmem_cache_alloc+0x31b/0x350 - __khugepaged_enter+0x39/0x100 - dup_mmap+0x1c7/0x5ce - copy_process+0x1afe/0x1c90 - kernel_clone+0x9a/0x3c0 - __do_sys_clone+0x66/0x90 - do_syscall_64+0x7f/0x160 - entry_SYSCALL_64_after_hwframe+0x6c/0x74 - stack_count: 234 + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + allocate_slab+0xbc/0x3f0 + ___slab_alloc+0x528/0x8a0 + kmem_cache_alloc+0x224/0x3b0 + sk_prot_alloc+0x58/0x1a0 + sk_alloc+0x32/0x4f0 + inet_create+0x427/0xb50 + __sock_create+0x2e4/0x650 + inet_ctl_sock_create+0x30/0x180 + igmp_net_init+0xc1/0x130 + ops_init+0x167/0x410 + setup_net+0x304/0xa60 + copy_net_ns+0x29b/0x4a0 + create_new_namespaces+0x4a1/0x820 + nr_base_pages: 16 ... ... echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt cat stacks_7000.txt - prep_new_page+0xa9/0x120 - get_page_from_freelist+0x7e6/0x2140 - __alloc_pages+0x18a/0x370 - alloc_pages_mpol+0xdf/0x1e0 - folio_alloc+0x14/0x50 - filemap_alloc_folio+0xb0/0x100 - page_cache_ra_unbounded+0x97/0x180 - filemap_fault+0x4b4/0x1200 - __do_fault+0x2d/0x110 - do_pte_missing+0x4b0/0xa30 - __handle_mm_fault+0x7fa/0xb70 - handle_mm_fault+0x125/0x300 - do_user_addr_fault+0x3c9/0x840 - exc_page_fault+0x68/0x150 - asm_exc_page_fault+0x22/0x30 - stack_count: 8248 + post_alloc_hook+0x177/0x1a0 + get_page_from_freelist+0xd01/0xd80 + __alloc_pages+0x39e/0x7e0 + alloc_pages_mpol+0x22e/0x490 + folio_alloc+0xd5/0x110 + filemap_alloc_folio+0x78/0x230 + page_cache_ra_order+0x287/0x6f0 + filemap_get_pages+0x517/0x1160 + filemap_read+0x304/0x9f0 + xfs_file_buffered_read+0xe6/0x1d0 [xfs] + xfs_file_read_iter+0x1f0/0x380 [xfs] + __kernel_read+0x3b9/0x730 + kernel_read_file+0x309/0x4d0 + __do_sys_finit_module+0x381/0x730 + do_syscall_64+0x8d/0x150 + entry_SYSCALL_64_after_hwframe+0x62/0x6a + nr_base_pages: 20824 ... cat /sys/kernel/debug/page_owner > page_owner_full.txt diff --git a/mm/page_owner.c b/mm/page_owner.c index 52d1ced0b57fb8..5df0d6892bdce6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record, spin_unlock_irqrestore(&stack_list_lock, flags); } -static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); @@ -217,15 +218,20 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) /* Add the new stack_record to our list */ add_stack_record_to_list(stack_record, gfp_mask); } - refcount_inc(&stack_record->count); + refcount_add(nr_base_pages, &stack_record->count); } -static void dec_stack_record_count(depot_stack_handle_t handle) +static void dec_stack_record_count(depot_stack_handle_t handle, + int nr_base_pages) { struct stack_record *stack_record = __stack_depot_get_stack_record(handle); - if (stack_record) - refcount_dec(&stack_record->count); + if (!stack_record) + return; + + if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) + pr_warn("%s: refcount went to 0 for %u handle\n", __func__, + handle); } static inline void __update_page_owner_handle(struct page_ext *page_ext, @@ -306,7 +312,7 @@ void __reset_page_owner(struct page *page, unsigned short order) * the machinery is not ready yet, we cannot decrement * their refcount either. */ - dec_stack_record_count(alloc_handle); + dec_stack_record_count(alloc_handle, 1 << order); } noinline void __set_page_owner(struct page *page, unsigned short order, @@ -325,7 +331,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order, current->pid, current->tgid, ts_nsec, current->comm); page_ext_put(page_ext); - inc_stack_record_count(handle, gfp_mask); + inc_stack_record_count(handle, gfp_mask, 1 << order); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -872,11 +878,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos) return stack; } -static unsigned long page_owner_stack_threshold; +static unsigned long page_owner_pages_threshold; static int stack_print(struct seq_file *m, void *v) { - int i, stack_count; + int i, nr_base_pages; struct stack *stack = v; unsigned long *entries; unsigned long nr_entries; @@ -887,14 +893,14 @@ static int stack_print(struct seq_file *m, void *v) nr_entries = stack_record->size; entries = stack_record->entries; - stack_count = refcount_read(&stack_record->count) - 1; + nr_base_pages = refcount_read(&stack_record->count) - 1; - if (stack_count < 1 || stack_count < page_owner_stack_threshold) + if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) return 0; for (i = 0; i < nr_entries; i++) seq_printf(m, " %pS\n", (void *)entries[i]); - seq_printf(m, "stack_count: %d\n\n", stack_count); + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); return 0; } @@ -924,13 +930,13 @@ static const struct file_operations page_owner_stack_operations = { static int page_owner_threshold_get(void *data, u64 *val) { - *val = READ_ONCE(page_owner_stack_threshold); + *val = READ_ONCE(page_owner_pages_threshold); return 0; } static int page_owner_threshold_set(void *data, u64 val) { - WRITE_ONCE(page_owner_stack_threshold, val); + WRITE_ONCE(page_owner_pages_threshold, val); return 0; } -- cgit 1.2.3-korg From 718b1f3373a7999f77e617c17abdcb98a3c001ea Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:01 +0200 Subject: mm,page_owner: fix accounting of pages when migrating Upon migration, new allocated pages are being given the handle of the old pages. This is problematic because it means that for the stack which allocated the old page, we will be substracting the old page + the new one when that page is freed, creating an accounting imbalance. There is an interest in keeping it that way, as otherwise the output will biased towards migration stacks should those operations occur often, but that is not really helpful. The link from the new page to the old stack is being performed by calling __update_page_owner_handle() in __folio_copy_owner(). The only thing that is left is to link the migrate stack to the old page, so the old page will be subtracted from the migrate stack, avoiding by doing so any possible imbalance. Link: https://lkml.kernel.org/r/20240404070702.2744-4-osalvador@suse.de Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- mm/page_owner.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/page_owner.c b/mm/page_owner.c index 5df0d6892bdce6..b4476f45b376e1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -366,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order) void __folio_copy_owner(struct folio *newfolio, struct folio *old) { + int i; struct page_ext *old_ext; struct page_ext *new_ext; struct page_owner *old_page_owner; + struct page_owner *new_page_owner; + depot_stack_handle_t migrate_handle; old_ext = page_ext_get(&old->page); if (unlikely(!old_ext)) @@ -381,6 +384,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) } old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + migrate_handle = new_page_owner->handle; __update_page_owner_handle(new_ext, old_page_owner->handle, old_page_owner->order, old_page_owner->gfp_mask, old_page_owner->last_migrate_reason, @@ -395,6 +400,16 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) old_page_owner->free_pid, old_page_owner->free_tgid, old_page_owner->free_ts_nsec); + /* + * We linked the original stack to the new folio, we need to do the same + * for the new one and the old folio otherwise there will be an imbalance + * when subtracting those pages from the stack. + */ + for (i = 0; i < (1 << new_page_owner->order); i++) { + old_page_owner->handle = migrate_handle; + old_ext = page_ext_next(old_ext); + old_page_owner = get_page_owner(old_ext); + } page_ext_put(new_ext); page_ext_put(old_ext); -- cgit 1.2.3-korg From 74017458017127ca6bf14b1f9fda69e03f43389b Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 4 Apr 2024 09:07:02 +0200 Subject: mm,page_owner: fix printing of stack records When seq_* code sees that its buffer overflowed, it re-allocates a bigger onecand calls seq_operations->start() callback again. stack_start() naively though that if it got called again, it meant that the old record got already printed so it returned the next object, but that is not true. The consequence of that is that every time stack_stop() -> stack_start() get called because we needed a bigger buffer, stack_start() will skip entries, and those will not be printed. Fix it by not advancing to the next object in stack_start(). Link: https://lkml.kernel.org/r/20240404070702.2744-5-osalvador@suse.de Fixes: 765973a09803 ("mm,page_owner: display all stacks and their count") Signed-off-by: Oscar Salvador Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- mm/page_owner.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index b4476f45b376e1..9bef0b4428634b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -872,13 +872,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos) * value of stack_list. */ stack = smp_load_acquire(&stack_list); + m->private = stack; } else { stack = m->private; - stack = stack->next; } - m->private = stack; - return stack; } -- cgit 1.2.3-korg From c5977c95dff182d6ee06f4d6f60bcb0284912969 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Apr 2024 19:19:20 -0400 Subject: mm/userfaultfd: allow hugetlb change protection upon poison entry After UFFDIO_POISON, there can be two kinds of hugetlb pte markers, either the POISON one or UFFD_WP one. Allow change protection to run on a poisoned marker just like !hugetlb cases, ignoring the marker irrelevant of the permission. Here the two bits are mutual exclusive. For example, when install a poisoned entry it must not be UFFD_WP already (by checking pte_none() before such install). And it also means if UFFD_WP is set there must have no POISON bit set. It makes sense because UFFD_WP is a bit to reflect permission, and permissions do not apply if the pte is poisoned and destined to sigbus. So here we simply check uffd_wp bit set first, do nothing otherwise. Attach the Fixes to UFFDIO_POISON work, as before that it should not be possible to have poison entry for hugetlb (e.g., hugetlb doesn't do swap, so no chance of swapin errors). Link: https://lkml.kernel.org/r/20240405231920.1772199-1-peterx@redhat.com Link: https://lore.kernel.org/r/000000000000920d5e0615602dd1@google.com Fixes: fc71884a5f59 ("mm: userfaultfd: add new UFFDIO_POISON ioctl") Signed-off-by: Peter Xu Reported-by: syzbot+b07c8ac8eee3d4d8440f@syzkaller.appspotmail.com Reviewed-by: David Hildenbrand Reviewed-by: Axel Rasmussen Cc: [6.6+] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23ef240ba48a60..31d00eee028f11 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7044,9 +7044,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { - /* No other markers apply for now. */ - WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); - if (uffd_wp_resolve) + /* + * Do nothing on a poison marker; page is + * corrupted, permissons do not apply. Here + * pte_marker_uffd_wp()==true implies !poison + * because they're mutual exclusive. + */ + if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { -- cgit 1.2.3-korg From 1983184c22dd84a4d95a71e5c6775c2638557dc7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 7 Apr 2024 16:54:56 +0800 Subject: mm/memory-failure: fix deadlock when hugetlb_optimize_vmemmap is enabled When I did hard offline test with hugetlb pages, below deadlock occurs: ====================================================== WARNING: possible circular locking dependency detected 6.8.0-11409-gf6cef5f8c37f #1 Not tainted ------------------------------------------------------ bash/46904 is trying to acquire lock: ffffffffabe68910 (cpu_hotplug_lock){++++}-{0:0}, at: static_key_slow_dec+0x16/0x60 but task is already holding lock: ffffffffabf92ea8 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x16/0x40 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (pcp_batch_high_lock){+.+.}-{3:3}: __mutex_lock+0x6c/0x770 page_alloc_cpu_online+0x3c/0x70 cpuhp_invoke_callback+0x397/0x5f0 __cpuhp_invoke_callback_range+0x71/0xe0 _cpu_up+0xeb/0x210 cpu_up+0x91/0xe0 cpuhp_bringup_mask+0x49/0xb0 bringup_nonboot_cpus+0xb7/0xe0 smp_init+0x25/0xa0 kernel_init_freeable+0x15f/0x3e0 kernel_init+0x15/0x1b0 ret_from_fork+0x2f/0x50 ret_from_fork_asm+0x1a/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x1298/0x1cd0 lock_acquire+0xc0/0x2b0 cpus_read_lock+0x2a/0xc0 static_key_slow_dec+0x16/0x60 __hugetlb_vmemmap_restore_folio+0x1b9/0x200 dissolve_free_huge_page+0x211/0x260 __page_handle_poison+0x45/0xc0 memory_failure+0x65e/0xc70 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x387/0x550 ksys_write+0x64/0xe0 do_syscall_64+0xca/0x1e0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(pcp_batch_high_lock); lock(cpu_hotplug_lock); lock(pcp_batch_high_lock); rlock(cpu_hotplug_lock); *** DEADLOCK *** 5 locks held by bash/46904: #0: ffff98f6c3bb23f0 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x64/0xe0 #1: ffff98f6c328e488 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xf8/0x1d0 #2: ffff98ef83b31890 (kn->active#113){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1d0 #3: ffffffffabf9db48 (mf_mutex){+.+.}-{3:3}, at: memory_failure+0x44/0xc70 #4: ffffffffabf92ea8 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x16/0x40 stack backtrace: CPU: 10 PID: 46904 Comm: bash Kdump: loaded Not tainted 6.8.0-11409-gf6cef5f8c37f #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x68/0xa0 check_noncircular+0x129/0x140 __lock_acquire+0x1298/0x1cd0 lock_acquire+0xc0/0x2b0 cpus_read_lock+0x2a/0xc0 static_key_slow_dec+0x16/0x60 __hugetlb_vmemmap_restore_folio+0x1b9/0x200 dissolve_free_huge_page+0x211/0x260 __page_handle_poison+0x45/0xc0 memory_failure+0x65e/0xc70 hard_offline_page_store+0x55/0xa0 kernfs_fop_write_iter+0x12c/0x1d0 vfs_write+0x387/0x550 ksys_write+0x64/0xe0 do_syscall_64+0xca/0x1e0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7fc862314887 Code: 10 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 RSP: 002b:00007fff19311268 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007fc862314887 RDX: 000000000000000c RSI: 000056405645fe10 RDI: 0000000000000001 RBP: 000056405645fe10 R08: 00007fc8623d1460 R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000000c R13: 00007fc86241b780 R14: 00007fc862417600 R15: 00007fc862416a00 In short, below scene breaks the lock dependency chain: memory_failure __page_handle_poison zone_pcp_disable -- lock(pcp_batch_high_lock) dissolve_free_huge_page __hugetlb_vmemmap_restore_folio static_key_slow_dec cpus_read_lock -- rlock(cpu_hotplug_lock) Fix this by calling drain_all_pages() instead. This issue won't occur until commit a6b40850c442 ("mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key"). As it introduced rlock(cpu_hotplug_lock) in dissolve_free_huge_page() code path while lock(pcp_batch_high_lock) is already in the __page_handle_poison(). [linmiaohe@huawei.com: extend comment per Oscar] [akpm@linux-foundation.org: reflow block comment] Link: https://lkml.kernel.org/r/20240407085456.2798193-1-linmiaohe@huawei.com Fixes: a6b40850c442 ("mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key") Signed-off-by: Miaohe Lin Acked-by: Oscar Salvador Reviewed-by: Jane Chu Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9349948f1abfd1..9e62a00b46ddee 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page) { int ret; - zone_pcp_disable(page_zone(page)); + /* + * zone_pcp_disable() can't be used here. It will + * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold + * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap + * optimization is enabled. This will break current lock dependency + * chain and leads to deadlock. + * Disabling pcp before dissolving the page was a deterministic + * approach because we made sure that those pages cannot end up in any + * PCP list. Draining PCP lists expels those pages to the buddy system, + * but nothing guarantees that those pages do not get back to a PCP + * queue if we need to refill those. + */ ret = dissolve_free_huge_page(page); - if (!ret) + if (!ret) { + drain_all_pages(page_zone(page)); ret = take_page_off_buddy(page); - zone_pcp_enable(page_zone(page)); + } return ret; } -- cgit 1.2.3-korg From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Sun, 7 Apr 2024 15:05:37 +0200 Subject: mm,swapops: update check in is_pfn_swap_entry for hwpoison entries Tony reported that the Machine check recovery was broken in v6.9-rc1, as he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to DRAM. After some more digging and debugging on his side, he realized that this went back to v6.1, with the introduction of 'commit 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'. That commit, among other things, introduced swp_offset_pfn(), replacing hwpoison_entry_to_pfn() in its favour. The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but is_pfn_swap_entry() never got updated to cover hwpoison entries, which means that we would hit the VM_BUG_ON whenever we would call swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM set. Fix this by updating the check to cover hwpoison entries as well, and update the comment while we are it. Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: Oscar Salvador Reported-by: Tony Luck Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/ Tested-by: Tony Luck Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Miaohe Lin Cc: [6.1.x] Signed-off-by: Andrew Morton --- include/linux/swapops.h | 65 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 48b700ba1d188a..a5c560a2f8c258 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) } #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_MEMORY_FAILURE + +/* + * Support for hardware poisoned pages + */ +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +} + +static inline int is_hwpoison_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_HWPOISON; +} + +#else + +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + return swp_entry(0, 0); +} + +static inline int is_hwpoison_entry(swp_entry_t swp) +{ + return 0; +} +#endif + typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) @@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) /* * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They are used to represent unaddressable device memory - * and to restrict access to a page undergoing migration. + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { @@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry); + is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; @@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#ifdef CONFIG_MEMORY_FAILURE - -/* - * Support for hardware poisoned pages - */ -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - BUG_ON(!PageLocked(page)); - return swp_entry(SWP_HWPOISON, page_to_pfn(page)); -} - -static inline int is_hwpoison_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_HWPOISON; -} - -#else - -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - return swp_entry(0, 0); -} - -static inline int is_hwpoison_entry(swp_entry_t swp) -{ - return 0; -} -#endif - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; -- cgit 1.2.3-korg From 9253c54e01b6505d348afbc02abaa4d9f8a01395 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 8 Apr 2024 23:02:06 +0100 Subject: Squashfs: check the inode number is not the invalid value of zero Syskiller has produced an out of bounds access in fill_meta_index(). That out of bounds access is ultimately caused because the inode has an inode number with the invalid value of zero, which was not checked. The reason this causes the out of bounds access is due to following sequence of events: 1. Fill_meta_index() is called to allocate (via empty_meta_index()) and fill a metadata index. It however suffers a data read error and aborts, invalidating the newly returned empty metadata index. It does this by setting the inode number of the index to zero, which means unused (zero is not a valid inode number). 2. When fill_meta_index() is subsequently called again on another read operation, locate_meta_index() returns the previous index because it matches the inode number of 0. Because this index has been returned it is expected to have been filled, and because it hasn't been, an out of bounds access is performed. This patch adds a sanity check which checks that the inode number is not zero when the inode is created and returns -EINVAL if it is. [phillip@squashfs.org.uk: whitespace fix] Link: https://lkml.kernel.org/r/20240409204723.446925-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20240408220206.435788-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reported-by: "Ubisectech Sirius" Closes: https://lore.kernel.org/lkml/87f5c007-b8a5-41ae-8b57-431e924c5915.bugreport@ubisectech.com/ Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- fs/squashfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index aa3411354e66d0..16bd693d0b3aa2 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, gid_t i_gid; int err; + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + if (inode->i_ino == 0) + return -EINVAL; + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; @@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); - inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); -- cgit 1.2.3-korg From 0b2cf0a45e06d9538a2371f90150297a87b20eea Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 9 Apr 2024 15:17:15 +0200 Subject: mm,page_owner: defer enablement of static branch Kefeng Wang reported that he was seeing some memory leaks with kmemleak with page_owner enabled. The reason is that we enable the page_owner_inited static branch and then proceed with the linking of stack_list struct to dummy_stack, which means that exists a race window between these two steps where we can have pages already being allocated calling add_stack_record_to_list(), allocating objects and linking them to stack_list, but then we set stack_list pointing to dummy_stack in init_page_owner. Which means that the objects that have been allocated during that time window are unreferenced and lost. Fix this by deferring the enablement of the branch until we have properly set up the list. Link: https://lkml.kernel.org/r/20240409131715.13632-1-osalvador@suse.de Fixes: 4bedfb314bdd ("mm,page_owner: maintain own list of stack_records structs") Signed-off-by: Oscar Salvador Reported-by: Kefeng Wang Closes: https://lore.kernel.org/linux-mm/74b147b0-718d-4d50-be75-d6afc801cd24@huawei.com/ Tested-by: Kefeng Wang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 9bef0b4428634b..742f432e5bf06f 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -118,7 +118,6 @@ static __init void init_page_owner(void) register_dummy_stack(); register_failure_stack(); register_early_stack(); - static_branch_enable(&page_owner_inited); init_early_allocated_pages(); /* Initialize dummy and failure stacks and link them to stack_list */ dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle); @@ -129,6 +128,7 @@ static __init void init_page_owner(void) refcount_set(&failure_stack.stack_record->count, 1); dummy_stack.next = &failure_stack; stack_list = &dummy_stack; + static_branch_enable(&page_owner_inited); } struct page_ext_operations page_owner_ops = { -- cgit 1.2.3-korg From 1f737846aa3c45f07a06fa0d018b39e1afb8084a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 9 Apr 2024 17:54:07 +0200 Subject: mm/shmem: inline shmem_is_huge() for disabled transparent hugepages In order to minimize code size (CONFIG_CC_OPTIMIZE_FOR_SIZE=y), compiler might choose to make a regular function call (out-of-line) for shmem_is_huge() instead of inlining it. When transparent hugepages are disabled (CONFIG_TRANSPARENT_HUGEPAGE=n), it can cause compilation error. mm/shmem.c: In function `shmem_getattr': ./include/linux/huge_mm.h:383:27: note: in expansion of macro `BUILD_BUG' 383 | #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) | ^~~~~~~~~ mm/shmem.c:1148:33: note: in expansion of macro `HPAGE_PMD_SIZE' 1148 | stat->blksize = HPAGE_PMD_SIZE; To prevent the possible error, always inline shmem_is_huge() when transparent hugepages are disabled. Link: https://lkml.kernel.org/r/20240409155407.2322714-1-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ilya Leoshkevich Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ mm/shmem.c | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a4c15db2f5e540..3fb18f7eb73eaf 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +#else +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) +{ + return false; +} +#endif + #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b80..94ab99b6b574a4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} - static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { -- cgit 1.2.3-korg From 35e351780fa9d8240dd6f7e4f245f9ea37e96c19 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 10 Apr 2024 17:14:41 +0800 Subject: fork: defer linking file vma until vma is fully initialized Thorvald reported a WARNING [1]. And the root cause is below race: CPU 1 CPU 2 fork hugetlbfs_fallocate dup_mmap hugetlbfs_punch_hole i_mmap_lock_write(mapping); vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree. i_mmap_unlock_write(mapping); hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem! i_mmap_lock_write(mapping); hugetlb_vmdelete_list vma_interval_tree_foreach hugetlb_vma_trylock_write -- Vma_lock is cleared. tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem! hugetlb_vma_unlock_write -- Vma_lock is assigned!!! i_mmap_unlock_write(mapping); hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside i_mmap_rwsem lock while vma lock can be used in the same time. Fix this by deferring linking file vma until vma is fully initialized. Those vmas should be initialized first before they can be used. Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing") Signed-off-by: Miaohe Lin Reported-by: Thorvald Natvig Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1] Reviewed-by: Jane Chu Cc: Christian Brauner Cc: Heiko Carstens Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Oleg Nesterov Cc: Peng Zhang Cc: Tycho Andersen Cc: Signed-off-by: Andrew Morton --- kernel/fork.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 39a5046c2f0bf4..aebb3e6c96dc62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; @@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, i_mmap_unlock_write(mapping); } - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - if (retval) { mpnt = vma_next(&vmi); goto loop_out; -- cgit 1.2.3-korg From 8247bf1db92a9697d4ee26db3259d2a1959d1366 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 13 Apr 2024 03:17:20 +0900 Subject: MAINTAINERS: update Naoya Horiguchi's email address My old NEC address has been removed, so update MAINTAINERS and .mailmap to map it to my gmail address. Link: https://lkml.kernel.org/r/20240412181720.18452-1-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Acked-by: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 8284692f961071..625b496bf5f451 100644 --- a/.mailmap +++ b/.mailmap @@ -446,7 +446,8 @@ Mythri P K Nadav Amit Nadav Amit Nadia Yvette Chambers William Lee Irwin III -Naoya Horiguchi +Naoya Horiguchi +Naoya Horiguchi Nathan Chancellor Neeraj Upadhyay Neil Armstrong diff --git a/MAINTAINERS b/MAINTAINERS index c23fda1aa1f092..ee9cc2b40409ea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10024,7 +10024,7 @@ F: drivers/media/platform/st/sti/hva HWPOISON MEMORY FAILURE HANDLING M: Miaohe Lin -R: Naoya Horiguchi +R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained F: mm/hwpoison-inject.c -- cgit 1.2.3-korg From c4a7dc9523b59b3e73fd522c73e95e072f876b16 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Tue, 16 Apr 2024 03:20:48 +0900 Subject: nilfs2: fix OOB in nilfs_set_de_type The size of the nilfs_type_by_mode array in the fs/nilfs2/dir.c file is defined as "S_IFMT >> S_SHIFT", but the nilfs_set_de_type() function, which uses this array, specifies the index to read from the array in the same way as "(mode & S_IFMT) >> S_SHIFT". static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) { umode_t mode = inode->i_mode; de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; // oob } However, when the index is determined this way, an out-of-bounds (OOB) error occurs by referring to an index that is 1 larger than the array size when the condition "mode & S_IFMT == S_IFMT" is satisfied. Therefore, a patch to resize the nilfs_type_by_mode array should be applied to prevent OOB errors. Link: https://lkml.kernel.org/r/20240415182048.7144-1-konishi.ryusuke@gmail.com Reported-by: syzbot+2e22057de05b9f3b30d8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2e22057de05b9f3b30d8 Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Signed-off-by: Jeongjun Park Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bc846b904b68d4..aee40db7a036fb 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = { #define S_SHIFT 12 static unsigned char -nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { +nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, -- cgit 1.2.3-korg