summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZefan Li <lizefan@huawei.com>2016-10-26 20:59:39 +0800
committerZefan Li <lizefan@huawei.com>2016-10-26 20:59:39 +0800
commit71d933fddc013a6cde5dd00beb888806ddd824c4 (patch)
tree0f3c1d138372c463f0ca4eb51b2020b8d5f9f8e3
parentbc45e503d0005b27c79bc8209d908265c62ca9c8 (diff)
downloadlinux-3.4.y-queue-71d933fddc013a6cde5dd00beb888806ddd824c4.tar.gz
Add two fixes
-rw-r--r--patches/mm-remove-gup_flags-foll_write-games-from-__get_user_pages.patch140
-rw-r--r--patches/net-fix-use-after-free-in-the-recvmmsg-exit-path.patch89
-rw-r--r--patches/series2
3 files changed, 231 insertions, 0 deletions
diff --git a/patches/mm-remove-gup_flags-foll_write-games-from-__get_user_pages.patch b/patches/mm-remove-gup_flags-foll_write-games-from-__get_user_pages.patch
new file mode 100644
index 0000000..859c53a
--- /dev/null
+++ b/patches/mm-remove-gup_flags-foll_write-games-from-__get_user_pages.patch
@@ -0,0 +1,140 @@
+From: Michal Hocko <mhocko@suse.com>
+Date: Sun, 16 Oct 2016 11:55:00 +0200
+Subject: mm, gup: close FOLL MAP_PRIVATE race
+
+commit 19be0eaffa3ac7d8eb6784ad9bdbc7d67ed8e619 upstream.
+
+faultin_page drops FOLL_WRITE after the page fault handler did the CoW
+and then we retry follow_page_mask to get our CoWed page. This is racy,
+however, because the page might have been unmapped by that time and so
+we would have to do a page fault again, this time without CoW. This
+would cause the page cache corruption for FOLL_FORCE on MAP_PRIVATE
+read only mappings with obvious consequences.
+
+This is an ancient bug that was actually already fixed once by Linus
+eleven years ago in commit 4ceb5db9757a ("Fix get_user_pages() race
+for write access") but that was then undone due to problems on s390
+by commit f33ea7f404e5 ("fix get_user_pages bug") because s390 didn't
+have proper dirty pte tracking until abf09bed3cce ("s390/mm: implement
+software dirty bits"). This wasn't a problem at the time as pointed out
+by Hugh Dickins because madvise relied on mmap_sem for write up until
+0a27a14a6292 ("mm: madvise avoid exclusive mmap_sem") but since then we
+can race with madvise which can unmap the fresh COWed page or with KSM
+and corrupt the content of the shared page.
+
+This patch is based on the Linus' approach to not clear FOLL_WRITE after
+the CoW page fault (aka VM_FAULT_WRITE) but instead introduces FOLL_COW
+to note this fact. The flag is then rechecked during follow_pfn_pte to
+enforce the page fault again if we do not see the CoWed page. Linus was
+suggesting to check pte_dirty again as s390 is OK now. But that would
+make backporting to some old kernels harder. So instead let's just make
+sure that vm_normal_page sees a pure anonymous page.
+
+This would guarantee we are seeing a real CoW page. Introduce
+can_follow_write_pte which checks both pte_write and falls back to
+PageAnon on forced write faults which passed CoW already. Thanks to Hugh
+to point out that a special care has to be taken for KSM pages because
+our COWed page might have been merged with a KSM one and keep its
+PageAnon flag.
+
+Fixes: 0a27a14a6292 ("mm: madvise avoid exclusive mmap_sem")
+Reported-by: Phil "not Paul" Oester <kernel@linuxace.com>
+Disclosed-by: Andy Lutomirski <luto@kernel.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+[bwh: Backported to 3.2:
+ - Adjust filename, context, indentation
+ - The 'no_page' exit path in follow_page() is different, so open-code the
+ cleanup
+ - Delete a now-unused label]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Zefan Li <lizefan@huawei.com>
+---
+ include/linux/mm.h | 1 +
+ mm/memory.c | 39 ++++++++++++++++++++++++++++-----------
+ 2 files changed, 29 insertions(+), 11 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1525,6 +1525,7 @@ struct page *follow_page(struct vm_area_
+ #define FOLL_MLOCK 0x40 /* mark page as mlocked */
+ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
+ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
++#define FOLL_COW 0x4000 /* internal GUP flag */
+
+ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+ void *data);
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -1447,6 +1447,24 @@ int zap_vma_ptes(struct vm_area_struct *
+ }
+ EXPORT_SYMBOL_GPL(zap_vma_ptes);
+
++static inline bool can_follow_write_pte(pte_t pte, struct page *page,
++ unsigned int flags)
++{
++ if (pte_write(pte))
++ return true;
++
++ /*
++ * Make sure that we are really following CoWed page. We do not really
++ * have to care about exclusiveness of the page because we only want
++ * to ensure that once COWed page hasn't disappeared in the meantime
++ * or it hasn't been merged to a KSM page.
++ */
++ if ((flags & FOLL_FORCE) && (flags & FOLL_COW))
++ return page && PageAnon(page) && !PageKsm(page);
++
++ return false;
++}
++
+ /**
+ * follow_page - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+@@ -1529,10 +1547,13 @@ split_fallthrough:
+ pte = *ptep;
+ if (!pte_present(pte))
+ goto no_page;
+- if ((flags & FOLL_WRITE) && !pte_write(pte))
+- goto unlock;
+
+ page = vm_normal_page(vma, address, pte);
++ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, flags)) {
++ pte_unmap_unlock(ptep, ptl);
++ return NULL;
++ }
++
+ if (unlikely(!page)) {
+ if ((flags & FOLL_DUMP) ||
+ !is_zero_pfn(pte_pfn(pte)))
+@@ -1575,7 +1596,7 @@ split_fallthrough:
+ unlock_page(page);
+ }
+ }
+-unlock:
++
+ pte_unmap_unlock(ptep, ptl);
+ out:
+ return page;
+@@ -1809,17 +1830,13 @@ int __get_user_pages(struct task_struct
+ * The VM_FAULT_WRITE bit tells us that
+ * do_wp_page has broken COW when necessary,
+ * even if maybe_mkwrite decided not to set
+- * pte_write. We can thus safely do subsequent
+- * page lookups as if they were reads. But only
+- * do so when looping for pte_write is futile:
+- * in some cases userspace may also be wanting
+- * to write to the gotten user page, which a
+- * read fault here might prevent (a readonly
+- * page might get reCOWed by userspace write).
++ * pte_write. We cannot simply drop FOLL_WRITE
++ * here because the COWed page might be gone by
++ * the time we do the subsequent page lookups.
+ */
+ if ((ret & VM_FAULT_WRITE) &&
+ !(vma->vm_flags & VM_WRITE))
+- foll_flags &= ~FOLL_WRITE;
++ foll_flags |= FOLL_COW;
+
+ cond_resched();
+ }
diff --git a/patches/net-fix-use-after-free-in-the-recvmmsg-exit-path.patch b/patches/net-fix-use-after-free-in-the-recvmmsg-exit-path.patch
new file mode 100644
index 0000000..93a5c65
--- /dev/null
+++ b/patches/net-fix-use-after-free-in-the-recvmmsg-exit-path.patch
@@ -0,0 +1,89 @@
+From 34b88a68f26a75e4fded796f1a49c40f82234b7d Mon Sep 17 00:00:00 2001
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+Date: Mon, 14 Mar 2016 09:56:35 -0300
+Subject: net: Fix use after free in the recvmmsg exit path
+
+commit 34b88a68f26a75e4fded796f1a49c40f82234b7d upstream.
+
+The syzkaller fuzzer hit the following use-after-free:
+
+ Call Trace:
+ [<ffffffff8175ea0e>] __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:295
+ [<ffffffff851cc31a>] __sys_recvmmsg+0x6fa/0x7f0 net/socket.c:2261
+ [< inline >] SYSC_recvmmsg net/socket.c:2281
+ [<ffffffff851cc57f>] SyS_recvmmsg+0x16f/0x180 net/socket.c:2270
+ [<ffffffff86332bb6>] entry_SYSCALL_64_fastpath+0x16/0x7a
+ arch/x86/entry/entry_64.S:185
+
+And, as Dmitry rightly assessed, that is because we can drop the
+reference and then touch it when the underlying recvmsg calls return
+some packets and then hit an error, which will make recvmmsg to set
+sock->sk->sk_err, oops, fix it.
+
+Reported-and-Tested-by: Dmitry Vyukov <dvyukov@google.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Kostya Serebryany <kcc@google.com>
+Cc: Sasha Levin <sasha.levin@oracle.com>
+Fixes: a2e2725541fa ("net: Introduce recvmmsg socket syscall")
+http://lkml.kernel.org/r/20160122211644.GC2470@redhat.com
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Zefan Li <lizefan@huawei.com>
+---
+ net/socket.c | 38 +++++++++++++++++++-------------------
+ 1 file changed, 19 insertions(+), 19 deletions(-)
+
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -2332,31 +2332,31 @@ int __sys_recvmmsg(int fd, struct mmsghd
+ break;
+ }
+
+-out_put:
+- fput_light(sock->file, fput_needed);
+-
+ if (err == 0)
+- return datagrams;
++ goto out_put;
++
++ if (datagrams == 0) {
++ datagrams = err;
++ goto out_put;
++ }
+
+- if (datagrams != 0) {
++ /*
++ * We may return less entries than requested (vlen) if the
++ * sock is non block and there aren't enough datagrams...
++ */
++ if (err != -EAGAIN) {
+ /*
+- * We may return less entries than requested (vlen) if the
+- * sock is non block and there aren't enough datagrams...
++ * ... or if recvmsg returns an error after we
++ * received some datagrams, where we record the
++ * error to return on the next call or if the
++ * app asks about it using getsockopt(SO_ERROR).
+ */
+- if (err != -EAGAIN) {
+- /*
+- * ... or if recvmsg returns an error after we
+- * received some datagrams, where we record the
+- * error to return on the next call or if the
+- * app asks about it using getsockopt(SO_ERROR).
+- */
+- sock->sk->sk_err = -err;
+- }
+-
+- return datagrams;
++ sock->sk->sk_err = -err;
+ }
++out_put:
++ fput_light(sock->file, fput_needed);
+
+- return err;
++ return datagrams;
+ }
+
+ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
diff --git a/patches/series b/patches/series
index 5147543..6779316 100644
--- a/patches/series
+++ b/patches/series
@@ -127,3 +127,5 @@ xen-pciback-save-the-number-of-msi-x-entries-to-be-copied-later.patch
ser_gigaset-remove-unnecessary-kfree-calls-from-release-method.patch
ser_gigaset-use-container_of-instead-of-detour.patch
net-core-revert-net-fix-__netdev_update_features-return.-and-add-comment.patch
+mm-remove-gup_flags-foll_write-games-from-__get_user_pages.patch
+net-fix-use-after-free-in-the-recvmmsg-exit-path.patch