author     Andrea Arcangeli <aarcange@redhat.com>   2023-06-06 20:56:24 -0400
committer  Andrea Arcangeli <aarcange@redhat.com>   2023-11-11 22:03:36 -0500
commit     a93c0e153788d782ff95f8edef60e65f351f6c0e (patch)
tree       17b1b97e816e9c9829db03d9540dfc4805d63b95
parent     8cd1f8e1e8fa330299bbbae6f51ec3a1b4fd8a41 (diff)
mm: gup: fix synchronicity of all GUP pins universally
This makes O_DIRECT+thread+fork work completely safe, with POSIX semantics retained at sub-pagesize hard-blocksize granularity just like !O_DIRECT, without the need of FOLL_PIN.

It further closes any potential discussion about a vmsplice in the parent followed by fork: that case should more likely return -EFAULT if the parent munmaps the memory backing the previous vmsplice just before the pipe reads it. Another view could be that if vmsplice must decouple itself from the virtual memory, then it shouldn't show any of the memory modifications done with the CPU on the pinned memory either, but it does. This change is not done for vmsplice, but it covers vmsplice-before-fork too, simply because it covers everything. In fact, any sign of vmsplice usage in an app is a sign of inefficiency, and it should be replaced with higher-performance IPC methods based on shared memory.

Last but not least, the vmsplice implementation is still not safe to use even after this change, because it can still take long-term GUP pins without any privilege, without using FOLL_LONGTERM and without mmu notifiers, which can cause a resource DoS.

This commit is also deemed unnecessary for all security purposes. There's no expectation that userland can take advantage of the new semantics of short-term FOLL_WRITE GUP pins taken before fork either. This is done for no practical reason, but just because this is the way.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
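For illustration only (not part of the patch), a minimal userspace sketch of the O_DIRECT+thread+fork scenario described above: one thread keeps an O_DIRECT read in flight, which takes a GUP reference on the anonymous buffer, while the main thread forks and the child writes to the same COW-shared page. The file name "testfile", the 4096-byte alignment/size, and the minimal error handling are assumptions.

/*
 * Illustrative sketch (not from the patch): with this change, fork()
 * copies any GUP-pinned anonymous page eagerly, so the in-flight DMA
 * keeps targeting the parent's copy and the child's write can neither
 * corrupt nor be corrupted by the I/O.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

#define BUF_SZ 4096

static char *buf;
static int fd;

static void *reader(void *arg)
{
	(void)arg;
	/* The kernel holds a GUP reference on "buf" for the DMA duration. */
	read(fd, buf, BUF_SZ);
	return NULL;
}

int main(void)
{
	pthread_t t;

	if (posix_memalign((void **)&buf, BUF_SZ, BUF_SZ))
		return 1;
	memset(buf, 0, BUF_SZ);

	fd = open("testfile", O_RDONLY | O_DIRECT);
	if (fd < 0)
		return 1;

	pthread_create(&t, NULL, reader, NULL);

	/* fork() may run while the read above is still in flight. */
	if (fork() == 0) {
		buf[0] = 'x';	/* child scribbles on the COW-shared page */
		_exit(0);
	}

	pthread_join(t, NULL);
	wait(NULL);
	close(fd);
	free(buf);
	return 0;
}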
-rw-r--r--   include/linux/mm.h | 17
-rw-r--r--   mm/gup.c           | 22
-rw-r--r--   mm/util.c          | 53
3 files changed, 63 insertions(+), 29 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0943fc9b536e9c..181809c045124c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1340,21 +1340,8 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-/*
- * This should most likely only be called during fork() to see whether we
- * should break the cow immediately for a page on the src mm.
- */
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
- struct page *page)
-{
- if (!is_cow_mapping(vma->vm_flags))
- return false;
-
- if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
- return false;
-
- return page_maybe_dma_pinned(page);
-}
+extern bool page_needs_cow_for_dma(struct vm_area_struct *vma,
+ struct page *page);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
diff --git a/mm/gup.c b/mm/gup.c
index 591f81cd964114..dcd8f907c9471b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1572,8 +1572,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
BUG_ON(*locked != 1);
}
- if (flags & FOLL_PIN)
- mm_set_has_pinned_flag(&mm->flags);
+ mm_set_has_pinned_flag(&mm->flags);
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
@@ -3100,11 +3099,9 @@ static unsigned long lockless_pages_from_mm(unsigned long start,
!gup_fast_permitted(start, end))
return 0;
- if (gup_flags & FOLL_PIN) {
- seq = raw_read_seqcount(&current->mm->write_protect_seq);
- if (seq & 1)
- return 0;
- }
+ seq = raw_read_seqcount(&current->mm->write_protect_seq);
+ if (seq & 1)
+ return 0;
/*
* Disable interrupts. The nested form is used, in order to allow full,
@@ -3125,11 +3122,9 @@ static unsigned long lockless_pages_from_mm(unsigned long start,
* When pinning pages for DMA there could be a concurrent write protect
* from fork() via copy_page_range(), in this case always fail fast GUP.
*/
- if (gup_flags & FOLL_PIN) {
- if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
- unpin_user_pages(pages, nr_pinned);
- return 0;
- }
+ if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
+ unpin_user_pages(pages, nr_pinned);
+ return 0;
}
return nr_pinned;
}
@@ -3148,8 +3143,7 @@ static int internal_get_user_pages_fast(unsigned long start,
FOLL_FAST_ONLY | FOLL_NOFAULT)))
return -EINVAL;
- if (gup_flags & FOLL_PIN)
- mm_set_has_pinned_flag(&current->mm->flags);
+ mm_set_has_pinned_flag(&current->mm->flags);
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
diff --git a/mm/util.c b/mm/util.c
index 0cb52499af1969..a2e14813fdebba 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -789,6 +789,59 @@ struct anon_vma *page_anon_vma(struct page *page)
return __page_rmapping(page);
}
+/*
+ * This should most likely only be called during fork() to see whether we
+ * should break the cow immediately for a page on the src mm.
+ */
+bool page_needs_cow_for_dma(struct vm_area_struct *vma, struct page *page)
+{
+ bool copy;
+ int val;
+
+ if (!is_cow_mapping(vma->vm_flags))
+ return false;
+
+ if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
+ return false;
+
+ if (!PageAnon(page))
+ return false;
+
+ /*
+ * If page_count is == 1 there cannot be any GUP pin and
+ * further GUP pins are prevented with write_protect_seq.
+ */
+ val = page_count(page);
+ VM_WARN_ON_ONCE_PAGE(val < 1, page);
+ if (val == 1)
+ return false;
+
+ /*
+ * If mapcount is > 1 at any given time there cannot be any
+ * R/O GUP pin thanks to gup_must_unshare() and further GUP
+ * pins are prevented with write_protect_seq.
+ */
+ val = page_mapcount(page);
+ VM_WARN_ON_ONCE_PAGE(val < 1, page);
+ if (val > 1)
+ return false;
+
+ /*
+ * COWing too much is unsafe since GUP pin exists, but copying
+ * too much in fork is always safe as far as GUP pins are
+ * concerned.
+ */
+ copy = true;
+ if (PageSwapCache(page) && trylock_page(page)) {
+ val = page_count(page) - PageSwapCache(page);
+ VM_WARN_ON_ONCE_PAGE(val < 1, page);
+ copy = val != 1;
+ unlock_page(page);
+ }
+
+ return copy;
+}
+
struct address_space *page_mapping(struct page *page)
{
struct address_space *mapping;