hugetlb: use new vma_lock for pmd sharing synchronization

The new hugetlb vma lock is used to address this race: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... The vma_lock is used as follows: - During fault processing. The lock is acquired in read mode before doing a page table lock and allocation (huge_pte_alloc). The lock is held until code is finished with the page table entry (ptep). - The lock must be held in write mode whenever huge_pmd_unshare is called. Lock ordering issues come into play when unmapping a page from all vmas mapping the page. The i_mmap_rwsem must be held to search for the vmas, and the vma lock must be held before calling unmap which will call huge_pmd_unshare. This is done today in: - try_to_migrate_one and try_to_unmap_ for page migration and memory error handling. In these routines we 'try' to obtain the vma lock and fail to unmap if unsuccessful. Calling routines already deal with the failure of unmapping. - hugetlb_vmdelete_list for truncation and hole punch. This routine also tries to acquire the vma lock. If it fails, it skips the unmapping. However, we can not have file truncation or hole punch fail because of contention. After hugetlb_vmdelete_list, truncation and hole punch call remove_inode_hugepages. remove_inode_hugepages checks for mapped pages and call hugetlb_unmap_file_page to unmap them. hugetlb_unmap_file_page is designed to drop locks and reacquire in the correct order to guarantee unmap success. Link: https://lkml.kernel.org/r/20220914221810.95771-9-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: James Houghton <jthoughton@google.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mina Almasry <almasrymina@google.com> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Prakash Sangappa <prakash.sangappa@oracle.com> Cc: Sven Schnelle <svens@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Mike Kravetz <mike.kravetz@oracle.com> 2022-09-14 15:18:09 -0700
committer: Andrew Morton <akpm@linux-foundation.org> 2022-10-03 14:03:17 -0700
commit: 40549ba8f8e0ed1f8b235979563f619e9aa34fdf (patch)
tree: 0f510be9e9447727760f39adb9588cafbe27bd4c /mm/rmap.c
parent: 378397ccb8e5a695a42e819df545ccd28641b683 (diff)
download: linux-40549ba8f8e0ed1f8b235979563f619e9aa34fdf.tar.gz
1 files changed, 65 insertions, 35 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index 0e179c823e0ac..b6743c2b8b5f4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1551,24 +1551,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
 			 * held in write mode.  Caller needs to explicitly
 			 * do this outside rmap routines.
+			 *
+			 * We also must hold hugetlb vma_lock in write mode.
+			 * Lock order dictates acquiring vma_lock BEFORE
+			 * i_mmap_rwsem.  We can only try lock here and fail
+			 * if unsuccessful.
 			 */
-			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-				flush_tlb_range(vma, range.start, range.end);
-				mmu_notifier_invalidate_range(mm, range.start,
-							      range.end);
-
-				/*
-				 * The ref count of the PMD page was dropped
-				 * which is part of the way map counting
-				 * is done for shared PMDs.  Return 'true'
-				 * here.  When there is no other sharing,
-				 * huge_pmd_unshare returns false and we will
-				 * unmap the actual page and drop map count
-				 * to zero.
-				 */
-				page_vma_mapped_walk_done(&pvmw);
-				break;
+			if (!anon) {
+				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				if (!hugetlb_vma_trylock_write(vma)) {
+					page_vma_mapped_walk_done(&pvmw);
+					ret = false;
+					break;
+				}
+				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+					hugetlb_vma_unlock_write(vma);
+					flush_tlb_range(vma,
+						range.start, range.end);
+					mmu_notifier_invalidate_range(mm,
+						range.start, range.end);
+					/*
+					 * The ref count of the PMD page was
+					 * dropped which is part of the way map
+					 * counting is done for shared PMDs.
+					 * Return 'true' here.  When there is
+					 * no other sharing, huge_pmd_unshare
+					 * returns false and we will unmap the
+					 * actual page and drop map count
+					 * to zero.
+					 */
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+				hugetlb_vma_unlock_write(vma);
 			}
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
@@ -1926,26 +1941,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
 			 * held in write mode.  Caller needs to explicitly
 			 * do this outside rmap routines.
+			 *
+			 * We also must hold hugetlb vma_lock in write mode.
+			 * Lock order dictates acquiring vma_lock BEFORE
+			 * i_mmap_rwsem.  We can only try lock here and
+			 * fail if unsuccessful.
 			 */
-			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-				flush_tlb_range(vma, range.start, range.end);
-				mmu_notifier_invalidate_range(mm, range.start,
-							      range.end);
-
-				/*
-				 * The ref count of the PMD page was dropped
-				 * which is part of the way map counting
-				 * is done for shared PMDs.  Return 'true'
-				 * here.  When there is no other sharing,
-				 * huge_pmd_unshare returns false and we will
-				 * unmap the actual page and drop map count
-				 * to zero.
-				 */
-				page_vma_mapped_walk_done(&pvmw);
-				break;
+			if (!anon) {
+				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				if (!hugetlb_vma_trylock_write(vma)) {
+					page_vma_mapped_walk_done(&pvmw);
+					ret = false;
+					break;
+				}
+				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+					hugetlb_vma_unlock_write(vma);
+					flush_tlb_range(vma,
+						range.start, range.end);
+					mmu_notifier_invalidate_range(mm,
+						range.start, range.end);
+
+					/*
+					 * The ref count of the PMD page was
+					 * dropped which is part of the way map
+					 * counting is done for shared PMDs.
+					 * Return 'true' here.  When there is
+					 * no other sharing, huge_pmd_unshare
+					 * returns false and we will unmap the
+					 * actual page and drop map count
+					 * to zero.
+					 */
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+				hugetlb_vma_unlock_write(vma);
 			}
-
 			/* Nuke the hugetlb page table entry */
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
author	Mike Kravetz <mike.kravetz@oracle.com>	2022-09-14 15:18:09 -0700
committer	Andrew Morton <akpm@linux-foundation.org>	2022-10-03 14:03:17 -0700
commit	40549ba8f8e0ed1f8b235979563f619e9aa34fdf (patch)
tree	0f510be9e9447727760f39adb9588cafbe27bd4c /mm/rmap.c
parent	378397ccb8e5a695a42e819df545ccd28641b683 (diff)
download	linux-40549ba8f8e0ed1f8b235979563f619e9aa34fdf.tar.gz