============================
Transparent Hugepage Support
============================

This document describes design principles for Transparent Hugepage (THP)
support and its interaction with other parts of the memory management
system.

Design principles
=================

- "graceful fallback": mm components which don't have transparent hugepage
  knowledge fall back to breaking a huge pmd mapping into a table of ptes
  and, if necessary, splitting a transparent hugepage. Therefore these
  components can continue working on the regular pages or regular pte
  mappings.
- if a hugepage allocation fails because of memory fragmentation, regular
  pages should be gracefully allocated instead and mixed in the same vma
  without any failure or significant delay and without userland noticing

- if some task quits and more hugepages become available (either immediately
  in the buddy or through the VM), guest physical memory backed by regular
  pages should be relocated to hugepages automatically (with khugepaged)

- it doesn't require memory reservation and in turn it uses hugepages whenever
  possible (the only possible reservation here is kernelcore= to avoid
  unmovable pages fragmenting all the memory, but such a tweak is not specific
  to transparent hugepage support and it's a generic feature that applies to
  all dynamic high order allocations in the kernel)

get_user_pages and pin_user_pages
=================================
get_user_pages and pin_user_pages, if run on a hugepage, will return the head
or tail pages as usual (exactly as they would do on hugetlbfs). Most GUP users
will only care about the actual physical address of the page and its temporary
pinning, to be released after the I/O is complete, so they won't ever notice
the fact that the page is huge. But if any driver is going to inspect the page
structure of a tail page (like checking page->mapping or other bits that are
relevant for the head page and not the tail page), it should be updated to
check the head page instead. Taking a reference on any head/tail page would
prevent the page from being split by anyone.

.. note::
   these aren't new constraints to the GUP API, and they match the same
   constraints that apply to hugetlbfs too, so any driver capable of handling
   GUP on hugetlbfs will also work fine on transparent hugepage backed
   mappings.
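As an illustration of the rule above, a driver that pins user pages and then
needs to look at per-page state could operate on the head page explicitly.
This is only a sketch of the pattern; the function name, its arguments, and
the handling inside the loop are hypothetical and not taken from any real
driver::

    /* Hypothetical sketch: pin user memory and inspect the head page. */
    static int demo_pin_and_check(unsigned long uaddr, unsigned long nr_pages,
                                  struct page **pages)
    {
            long pinned, i;

            /* May return tail pages of a THP, exactly like on hugetlbfs. */
            pinned = pin_user_pages(uaddr, nr_pages, FOLL_WRITE, pages);
            if (pinned <= 0)
                    return pinned ? pinned : -EFAULT;

            for (i = 0; i < pinned; i++) {
                    /* Fields like ->mapping are meaningful on the head page only. */
                    struct page *head = compound_head(pages[i]);

                    if (!head->mapping) {
                            /* ... driver-specific handling ... */
                    }
            }

            /* The pins taken above also prevent the THP from being split. */
            unpin_user_pages(pages, pinned);
            return 0;
    }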
Graceful fallback
=================

Code walking pagetables but unaware of huge pmds can simply call
split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware by just
grepping for "pmd_offset" and adding split_huge_pmd where missing after
pmd_offset returns the pmd. Thanks to the graceful fallback design, with a
one-liner change, you can avoid writing hundreds if not thousands of lines of
complex code to make your code hugepage aware.

If you're not walking pagetables but you run into a physical hugepage that you
can't handle natively in your code, you can split it by calling
split_huge_page(page). This is what the Linux VM does before it tries to swap
out the hugepage, for example. split_huge_page() can fail if the page is
pinned and you must handle this correctly.

Example to make mremap.c transparent hugepage aware with a one-liner change::

    diff --git a/mm/mremap.c b/mm/mremap.c
    --- a/mm/mremap.c
    +++ b/mm/mremap.c
    @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
                    return NULL;

            pmd = pmd_offset(pud, addr);
    +       split_huge_pmd(vma, pmd, addr);
            if (pmd_none_or_clear_bad(pmd))
                    return NULL;

Locking in hugepage aware code
==============================

We want as much code as possible hugepage aware, as calling split_huge_page()
or split_huge_pmd() has a cost.

To make pagetable walks huge pmd aware, all you need to do is to call
pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
mmap_lock in read (or write) mode to be sure a huge pmd cannot be created from
under you by khugepaged (khugepaged collapse_huge_page takes the mmap_lock in
write mode in addition to the anon_vma lock). If pmd_trans_huge returns false,
you just fall back to the old code paths. If instead pmd_trans_huge returns
true, you have to take the page table lock (pmd_lock()) and re-run
pmd_trans_huge.
Taking the page table lock will prevent the huge pmd from being converted into
a regular pmd from under you (split_huge_pmd can run in parallel to the
pagetable walk). If the second pmd_trans_huge returns false, you should just
drop the page table lock and fall back to the old code as before. Otherwise,
you can proceed to process the huge pmd and the hugepage natively. Once
finished, you can drop the page table lock.
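The check-and-recheck protocol above can be summarized in a short sketch. The
walker function and its arguments are hypothetical; only pmd_offset(),
pmd_trans_huge() and pmd_lock() are the interfaces referred to by the text,
and the handling of the huge and regular cases is left to the caller::

    /* Hypothetical walker: caller holds mmap_lock in read (or write) mode. */
    static void demo_walk_pmd(struct vm_area_struct *vma, pud_t *pud,
                              unsigned long addr)
    {
            pmd_t *pmd = pmd_offset(pud, addr);
            spinlock_t *ptl;

            if (pmd_trans_huge(*pmd)) {
                    ptl = pmd_lock(vma->vm_mm, pmd);
                    /* Re-check: split_huge_pmd() may have run in parallel. */
                    if (pmd_trans_huge(*pmd)) {
                            /* ... process the huge pmd / hugepage natively ... */
                            spin_unlock(ptl);
                            return;
                    }
                    spin_unlock(ptl);
            }

            /* Fall back to the regular pte-based code paths. */
    }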
Refcounts and transparent huge pages
====================================

Refcounting on THP is mostly consistent with refcounting on other compound
pages:

- get_page()/put_page() and GUP operate on the folio->_refcount.

- ->_refcount in tail pages is always zero: get_page_unless_zero() never
  succeeds on tail pages.

- map/unmap of a PMD entry for the whole THP increment/decrement
  folio->_entire_mapcount and folio->_large_mapcount.

  We also maintain the two slots for tracking MM owners (MM ID and
  corresponding mapcount), and the current status ("maybe mapped shared" vs.
  "mapped exclusively").

  With CONFIG_PAGE_MAPCOUNT, we also increment/decrement
  folio->_nr_pages_mapped by ENTIRELY_MAPPED when _entire_mapcount goes
  from -1 to 0 or 0 to -1.

- map/unmap of individual pages with PTE entry increment/decrement
  folio->_large_mapcount.

  We also maintain the two slots for tracking MM owners (MM ID and
  corresponding mapcount), and the current status ("maybe mapped shared" vs.
  "mapped exclusively").

  With CONFIG_PAGE_MAPCOUNT, we also increment/decrement page->_mapcount and
  increment/decrement folio->_nr_pages_mapped when page->_mapcount goes from
  -1 to 0 or 0 to -1 as this counts the number of pages mapped by PTE.
split_huge_page internally has to distribute the refcounts in the head page to
the tail pages before clearing all PG_head/tail bits from the page structures.
It can be done easily for refcounts taken by page table entries, but we don't
have enough information on how to distribute any additional pins (i.e. from
get_user_pages). split_huge_page() fails any requests to split pinned huge
pages: it expects the page count to be equal to the sum of the mapcounts of
all sub-pages plus one (the split_huge_page caller must have a reference to
the head page).

split_huge_page uses migration entries to stabilize page->_refcount and
page->_mapcount of anonymous pages. File pages just get unmapped.

We are safe against physical memory scanners too: the only legitimate way a
scanner can get a reference to a page is get_page_unless_zero().

All tail pages have zero ->_refcount until atomic_add(). This prevents the
scanner from getting a reference to a tail page up to that point. After the
atomic_add() we don't care about the ->_refcount value. We already know how
many references should be uncharged from the head page.

For the head page get_page_unless_zero() will succeed and we don't mind. It's
clear where references should go after the split: they will stay on the head
page.

Note that split_huge_pmd() doesn't have any limitations on refcounting: a pmd
can be split at any point and never fails.
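Because split_huge_page() can fail (for example when extra pins are present),
callers must be prepared to fall back. The sketch below only illustrates that
calling convention; the surrounding function is hypothetical, and it assumes
the caller already holds a reference to the page and takes the page lock
before attempting the split::

    /* Hypothetical: try to split a THP we hold a reference to. */
    static int demo_try_split(struct page *page)
    {
            int ret;

            lock_page(page);
            /* Returns non-zero if the page could not be split (e.g. extra pins). */
            ret = split_huge_page(page);
            unlock_page(page);

            if (ret)
                    return ret;     /* caller keeps handling the huge page */

            /* From here on, 'page' is a regular order-0 page. */
            return 0;
    }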
Partial unmap and deferred_split_folio() (anon THP only)
=========================================================

Unmapping part of a THP (with munmap() or some other way) is not going to free
memory immediately. Instead, we detect that a subpage of the THP is not in use
in folio_remove_rmap_*() and queue the THP for splitting if memory pressure
comes. Splitting will free up the unused subpages.

Splitting the page right away is not an option due to the locking context in
the place where we can detect partial unmap. It also might be
counterproductive, since in many cases partial unmap happens during exit(2) if
a THP crosses a VMA boundary.

The function deferred_split_folio() is used to queue a folio for splitting.
The splitting itself will happen when we get memory pressure via the shrinker
interface.

With CONFIG_PAGE_MAPCOUNT, we reliably detect partial mappings based on
folio->_nr_pages_mapped.

With CONFIG_NO_PAGE_MAPCOUNT, we detect partial mappings based on the average
per-page mapcount in a THP: if the average is < 1, an anon THP is certainly
partially mapped. As long as only a single process maps a THP, this detection
is reliable.
With long-running child processes, there can be scenarios where partial
mappings can currently not be detected, and these might need asynchronous
detection during memory reclaim in the future.
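As a rough illustration of the average-mapcount heuristic, a check along the
following lines flags an anon folio as partially mapped. This is only a sketch
of the idea, not the in-kernel detection logic, and folio_large_mapcount() and
folio_nr_pages() are used here merely as plausible accessors for the counters
mentioned above::

    /*
     * Sketch of the heuristic only: if the total mapcount is lower than the
     * number of pages, the average per-page mapcount is < 1, so some
     * subpages must be unmapped.
     */
    static bool demo_anon_folio_partially_mapped(struct folio *folio)
    {
            return folio_large_mapcount(folio) < folio_nr_pages(folio);
    }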