author     Andrea Arcangeli <aarcange@redhat.com>  2023-06-08 13:26:25 -0400
committer  Andrea Arcangeli <aarcange@redhat.com>  2023-11-11 22:03:36 -0500
commit     6f6cacb457e261c27f50d13deef2cf675b0b6e16 (patch)
tree       5744abcfe5a32274711cc0640163e7bf75fd0d2e
parent     051f877967290dbacff57b3f88cb0dbcc3651565 (diff)
download   aa-6f6cacb457e261c27f50d13deef2cf675b0b6e16.tar.gz
mm: gup: enable PageAnonGup to reduce the false positive copies in fork
This reduces the false positive COWs during fork to the intersection of
the anon pages that have ever been pinned with the ones that still have
the page count elevated for some reason and that aren't currently being
shared.

The original implementation of the gup page bitflag was proposed in the
Link. This version however handles hugetlbfs and THP too, so a simple
page bitflag isn't enough and a compound one is required as well.
hugetlbfs only needs to reflect the "anon gup" bit in the compound
field; the subpage bitflag is never set. THP has to reflect the "anon
gup" bit in the compound field as well as in the subpage bit.

To monitor the effect:

bpftrace -e 'kretprobe:page_needs_cow_for_dma { @tot[retval] = count() }'

Running the above during sddm plasma login and then starting firefox on
Fedora, the output is:

[..]
@tot[1]: 80
@tot[0]: 1037304

The false positive COWs are 80 over a total of >1M COWs. THP in the
test was set to "always" in the kernel config.

Even if all other things were equal, the extra false positive copies,
if they ever happen, are preferable to deal with in fork() rather than
in the COW fault, because:

1) there will be at most one extra false positive copy per page, no
   matter the amount of later mprotect and other mm activity.

2) if the false positive copy fails the allocation, we can fail
   gracefully with -ENOMEM in fork(), unlike in the COW fault which is
   forced to segfault.

3) fork() is the only place where we can count the mappings and skip
   the extra copies for virtually split THP. The COW fault doesn't hold
   the right lock for that.

Link: lkml.kernel.org/r/20090311165833.GI27823@random.random
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
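The patch does not ship a test, but a minimal userspace sketch of the
scenario it optimizes may help to see the intent: vmsplice() takes a GUP
reference on an anonymous page, so a later fork() reaches the
page_needs_cow_for_dma() path changed below with the "anon gup" bit set
and the page count elevated, and copies the page eagerly. The program is
illustrative only and not part of the patch; it assumes standard
vmsplice(2)/fork(2) semantics and the behaviour described above.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/uio.h>
	#include <sys/wait.h>

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		int pipefd[2];
		struct iovec iov;
		pid_t pid;
		char *buf;

		buf = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED || pipe(pipefd))
			return 1;
		memset(buf, 0xaa, psz);

		/*
		 * vmsplice() GUP-references the anon page; with this patch
		 * that also marks it "anon gup".
		 */
		iov.iov_base = buf;
		iov.iov_len = psz;
		if (vmsplice(pipefd[1], &iov, 1, 0) != psz)
			return 1;

		/*
		 * fork() while the pipe still holds the reference: the page
		 * is in the "ever pinned" and "elevated count" intersection,
		 * so the parent copies it eagerly instead of write-protecting
		 * it. Elevated-count pages that were never pinned are no
		 * longer copied.
		 */
		pid = fork();
		if (pid == 0) {
			/* child writes its private copy; the parent's pinned
			 * page backing the pipe is untouched */
			buf[0] = 0x55;
			_exit(0);
		}
		waitpid(pid, NULL, 0);
		return 0;
	}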
-rw-r--r--  include/linux/mm.h        6
-rw-r--r--  include/linux/mm_types.h  9
-rw-r--r--  mm/gup.c                  89
-rw-r--r--  mm/huge_memory.c          10
-rw-r--r--  mm/hugetlb.c              5
-rw-r--r--  mm/memory.c               2
-rw-r--r--  mm/util.c                 10
7 files changed, 113 insertions, 18 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 181809c045124c..74160a09e6886d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1219,7 +1219,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
}
-bool __must_check try_grab_page(struct page *page, unsigned int flags);
+bool __must_check try_grab_page(struct page *page, unsigned long addr,
+ unsigned int flags);
struct page *try_grab_compound_head(struct page *page, int refs,
unsigned int flags);
@@ -1341,7 +1342,8 @@ static inline bool is_cow_mapping(vm_flags_t flags)
}
extern bool page_needs_cow_for_dma(struct vm_area_struct *vma,
- struct page *page);
+ struct page *page,
+ bool compound);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6853a79a3a11a8..ddd75e24a3a3ba 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -165,6 +165,10 @@ struct page {
/* For both global and memcg */
struct list_head deferred_list;
};
+ struct { /* Third tail page of compound page */
+ unsigned long _compound_pad_2; /* compound_head */
+ bool anon_gup;
+ };
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
@@ -255,6 +259,11 @@ static inline atomic_t *compound_pincount_ptr(struct page *page)
return &page[2].hpage_pinned_refcount;
}
+static inline bool *compound_anon_gup(struct page *page)
+{
+ return &page[3].anon_gup;
+}
+
/*
* Used for sizing the vmemmap region on some architectures
*/
diff --git a/mm/gup.c b/mm/gup.c
index dcd8f907c9471b..4f7bcc8d9a9ae1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -234,6 +234,50 @@ static void put_page_refs(struct page *page, int refs)
put_page(page);
}
+static inline void __set_compound_anon_gup(struct page *head)
+{
+ if (PageHead(head) && head_compound_mapcount(head)) {
+ /*
+ * Only set this if there can ever be a COW
+ * fault on a huge pmd. COW faults on PTE
+ * mapped THP only care about the subpage not
+ * to end up being mapped read-write in the
+ * child, not the whole THP.
+ */
+ if (!READ_ONCE(*compound_anon_gup(head)))
+ WRITE_ONCE(*compound_anon_gup(head), true);
+ }
+}
+
+static inline void set_compound_anon_gup(struct page *head)
+{
+ if (PageHeadAnonNoKsm(head))
+ __set_compound_anon_gup(head);
+}
+
+static void set_anon_gup_addr(struct page *page, unsigned long addr)
+{
+ bool need_subpage;
+ struct page *head = compound_head(page);
+
+ if (!PageHeadAnonNoKsm(head))
+ return;
+
+ need_subpage = true;
+ if (PageHead(head)) {
+ if (!PageHuge(head))
+ page = head + page_trans_huge_subpage_idx(addr);
+ else {
+ VM_BUG_ON(!PageHead(page));
+ need_subpage = false;
+ }
+ __set_compound_anon_gup(head);
+ }
+
+ if (!PageAnonGup(page) && need_subpage)
+ SetPageAnonGup(page);
+}
+
/*
* Return the compound head page with ref appropriately incremented,
* or NULL if that failed.
@@ -261,6 +305,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
return NULL;
}
+ set_compound_anon_gup(head);
+
return head;
}
@@ -378,13 +424,19 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
* nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
* FOLL_PIN was set, but the page could not be grabbed.
*/
-bool __must_check try_grab_page(struct page *page, unsigned int flags)
+bool __must_check try_grab_page(struct page *page, unsigned long addr,
+ unsigned int flags)
{
+ bool grabbed;
+
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
- if (flags & FOLL_GET)
- return try_get_page(page);
- else if (flags & FOLL_PIN) {
+ if (flags & FOLL_GET) {
+ grabbed = try_get_page(page);
+ if (grabbed)
+ set_anon_gup_addr(page, addr);
+ return grabbed;
+ } else if (flags & FOLL_PIN) {
int refs = 1;
page = compound_head(page);
@@ -404,6 +456,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
* once, so that the page really is pinned.
*/
page_ref_add(page, refs);
+ set_anon_gup_addr(page, addr);
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
}
@@ -768,7 +821,7 @@ retry:
goto out;
}
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- if (unlikely(!try_grab_page(page, flags))) {
+ if (unlikely(!try_grab_page(page, address, flags))) {
page = ERR_PTR(-ENOMEM);
goto out;
}
@@ -1130,7 +1183,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(*pte);
}
- if (unlikely(!try_grab_page(*page, gup_flags))) {
+ if (unlikely(!try_grab_page(*page, address, gup_flags))) {
ret = -ENOMEM;
goto unmap;
}
@@ -2615,6 +2668,10 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
goto pte_unmap;
}
}
+
+ if (PageAnonNoKsm(page) && !PageAnonGup(page))
+ SetPageAnonGup(page);
+
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -2668,7 +2725,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
}
SetPageReferenced(page);
pages[*nr] = page;
- if (unlikely(!try_grab_page(page, flags))) {
+ if (unlikely(!try_grab_page(page, addr, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
ret = 0;
break;
@@ -2745,6 +2802,16 @@ static int record_subpages(struct page *page, unsigned long addr,
return nr;
}
+static void anongup_subpages(struct page *page, unsigned long addr,
+ unsigned long end)
+{
+ if (!PageAnonNoKsm(page) || PageHuge(page))
+ return;
+ for (; addr != end; addr += PAGE_SIZE, page++)
+ if (!PageAnonGup(page))
+ SetPageAnonGup(page);
+}
+
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -2787,6 +2854,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
+ anongup_subpages(page, addr, end);
+
*nr += refs;
SetPageReferenced(head);
return 1;
@@ -2856,6 +2925,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ anongup_subpages(page, addr, end);
+
*nr += refs;
SetPageReferenced(head);
return 1;
@@ -2890,6 +2961,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ anongup_subpages(page, addr, end);
+
*nr += refs;
SetPageReferenced(head);
return 1;
@@ -2919,6 +2992,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ anongup_subpages(page, addr, end);
+
*nr += refs;
SetPageReferenced(head);
return 1;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bdc629c550cb47..10101a10747dc2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1019,7 +1019,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
+ if (!try_grab_page(page, addr, flags))
page = ERR_PTR(-ENOMEM);
return page;
@@ -1105,7 +1105,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
- if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
+ if (unlikely(page_needs_cow_for_dma(src_vma, src_page, true))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -1186,7 +1186,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
+ if (!try_grab_page(page, addr, flags))
page = ERR_PTR(-ENOMEM);
return page;
@@ -1219,7 +1219,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/* Please refer to comments in copy_huge_pmd() */
- if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
+ if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud), true))) {
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pud(vma, src_pud, addr);
@@ -1421,7 +1421,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
gup_must_unshare(flags, page, addr, true, vma))
return ERR_PTR(-EMLINK);
- if (!try_grab_page(page, flags))
+ if (!try_grab_page(page, addr, flags))
return ERR_PTR(-ENOMEM);
if (flags & FOLL_TOUCH)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 474c7a502f6d44..2552484e9dd46f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4383,7 +4383,8 @@ again:
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+ if (unlikely(page_needs_cow_for_dma(vma, ptepage,
+ true))) {
pte_t src_pte_old = entry;
struct page *new;
@@ -6290,7 +6291,7 @@ retry:
* in any way. So this page must be available at this point,
* unless the page refcount overflowed:
*/
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ if (WARN_ON_ONCE(!try_grab_page(page, address, flags))) {
page = NULL;
goto out;
}
diff --git a/mm/memory.c b/mm/memory.c
index ab25ec3dcea1f9..aa3bb5df701b05 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -900,7 +900,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* the page count. That might give false positives for
* for pinning, but it will work correctly.
*/
- if (likely(!page_needs_cow_for_dma(src_vma, page)))
+ if (likely(!page_needs_cow_for_dma(src_vma, page, false)))
return 1;
new_page = *prealloc;
diff --git a/mm/util.c b/mm/util.c
index a2e14813fdebba..f819bff52a8c75 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -793,7 +793,8 @@ struct anon_vma *page_anon_vma(struct page *page)
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for a page on the src mm.
*/
-bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page)
+bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page,
+ bool compound)
{
bool copy;
int val;
@@ -807,6 +808,13 @@ bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page)
if (!PageAnon(page))
return false;
+ VM_BUG_ON(compound && !PageHead(page));
+ if (compound) {
+ if (!READ_ONCE(*compound_anon_gup(page)))
+ return false;
+ } else if (!PageAnonGup(page))
+ return false;
+
/*
* If page_count is == 1 there cannot be any GUP pin and
* further GUP pins are prevented with write_protect_seq.