aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2024-04-29 08:12:10 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2024-04-29 08:12:10 +1000
commite441537f271a6507413174ac6a1f2346729473ae (patch)
tree01b458d1ba543d467b015a9de1522373427bbc32
parent0189aaf9ffaf70d049704a74b28ccad1118ae3c4 (diff)
parent0e7486112abe567267ea6a1bbcf4ca5fe9484136 (diff)
downloadlinux-next-history-e441537f271a6507413174ac6a1f2346729473ae.tar.gz
Merge branch 'mm-everything' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-damon6
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage18
-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst4
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst18
-rw-r--r--Documentation/admin-guide/mm/damon/usage.rst26
-rw-r--r--Documentation/admin-guide/mm/transhuge.rst35
-rw-r--r--Documentation/admin-guide/mm/zswap.rst29
-rw-r--r--Documentation/core-api/floating-point.rst78
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/driver-api/crypto/iaa/iaa-crypto.rst2
-rw-r--r--Documentation/filesystems/api-summary.rst3
-rw-r--r--Documentation/filesystems/buffer.rst12
-rw-r--r--Documentation/filesystems/index.rst1
-rw-r--r--Documentation/mm/damon/design.rst20
-rw-r--r--Documentation/mm/page_table_check.rst9
-rw-r--r--Documentation/mm/transhuge.rst12
-rw-r--r--Documentation/userspace-api/index.rst1
-rw-r--r--Documentation/userspace-api/mseal.rst199
-rw-r--r--Makefile5
-rw-r--r--arch/Kconfig6
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/Makefile7
-rw-r--r--arch/arm/include/asm/fpu.h15
-rw-r--r--arch/arm/lib/Makefile3
-rw-r--r--arch/arm/mm/fault.c30
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/Kconfig1
-rw-r--r--arch/arm64/Makefile9
-rw-r--r--arch/arm64/include/asm/fpu.h15
-rw-r--r--arch/arm64/include/asm/pgtable.h55
-rw-r--r--arch/arm64/include/asm/unistd.h2
-rw-r--r--arch/arm64/include/asm/unistd32.h2
-rw-r--r--arch/arm64/lib/Makefile6
-rw-r--r--arch/arm64/mm/contpte.c29
-rw-r--r--arch/arm64/mm/fault.c43
-rw-r--r--arch/loongarch/Kconfig1
-rw-r--r--arch/loongarch/Makefile5
-rw-r--r--arch/loongarch/include/asm/fpu.h1
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/Makefile5
-rw-r--r--arch/powerpc/include/asm/fpu.h28
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/riscv/Kconfig1
-rw-r--r--arch/riscv/Makefile3
-rw-r--r--arch/riscv/include/asm/fpu.h16
-rw-r--r--arch/riscv/kernel/Makefile1
-rw-r--r--arch/riscv/kernel/kernel_mode_fpu.c28
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/pci/pci_mmio.c4
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sh/mm/cache.c2
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/Makefile20
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/include/asm/fpu.h13
-rw-r--r--arch/x86/include/asm/fpu/types.h6
-rw-r--r--arch/x86/include/asm/pgtable.h18
-rw-r--r--arch/x86/mm/pat/memtype.c5
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/xtensa/mm/tlb.c11
-rw-r--r--drivers/dax/kmem.c30
-rw-r--r--drivers/gpu/drm/amd/display/Kconfig2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c35
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/Makefile36
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml2/Makefile36
-rw-r--r--drivers/media/cec/platform/sti/stih-cec.c1
-rw-r--r--drivers/media/rc/mtk-cir.c1
-rw-r--r--drivers/media/rc/serial_ir.c1
-rw-r--r--drivers/media/rc/st_rc.c1
-rw-r--r--drivers/media/rc/sunxi-cir.c1
-rw-r--r--drivers/vfio/vfio_iommu_type1.c4
-rw-r--r--drivers/virt/acrn/mm.c62
-rw-r--r--fs/buffer.c165
-rw-r--r--fs/crypto/inline_crypt.c6
-rw-r--r--fs/f2fs/data.c5
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/super.c388
-rw-r--r--fs/nilfs2/the_nilfs.c5
-rw-r--r--fs/nilfs2/the_nilfs.h6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/proc/task_mmu.c24
-rw-r--r--include/linux/bitops.h12
-rw-r--r--include/linux/buffer_head.h48
-rw-r--r--include/linux/damon.h2
-rw-r--r--include/linux/fpu.h12
-rw-r--r--include/linux/huge_mm.h52
-rw-r--r--include/linux/hugetlb.h14
-rw-r--r--include/linux/kexec.h6
-rw-r--r--include/linux/kfifo.h9
-rw-r--r--include/linux/ksm.h14
-rw-r--r--include/linux/memcontrol.h17
-rw-r--r--include/linux/memory-tiers.h13
-rw-r--r--include/linux/mm.h99
-rw-r--r--include/linux/mm_types.h14
-rw-r--r--include/linux/page-flags.h8
-rw-r--r--include/linux/page_ref.h11
-rw-r--r--include/linux/pagemap.h6
-rw-r--r--include/linux/pgtable.h74
-rw-r--r--include/linux/rmap.h42
-rw-r--r--include/linux/swap.h5
-rw-r--r--include/linux/syscalls.h1
-rw-r--r--include/linux/writeback.h1
-rw-r--r--include/linux/xarray.h8
-rw-r--r--include/trace/events/page_ref.h4
-rw-r--r--include/uapi/asm-generic/unistd.h5
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--lib/Kconfig.debug2
-rw-r--r--lib/Makefile26
-rw-r--r--lib/kfifo.c8
-rw-r--r--lib/raid6/Makefile33
-rw-r--r--lib/test_fpu.h8
-rw-r--r--lib/test_fpu_glue.c (renamed from lib/test_fpu.c)37
-rw-r--r--lib/test_fpu_impl.c37
-rw-r--r--lib/xarray.c3
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c177
-rw-r--r--mm/damon/paddr.c64
-rw-r--r--mm/damon/sysfs-schemes.c1
-rw-r--r--mm/debug.c12
-rw-r--r--mm/filemap.c22
-rw-r--r--mm/folio-compat.c6
-rw-r--r--mm/gup.c23
-rw-r--r--mm/huge_memory.c92
-rw-r--r--mm/hugetlb.c29
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/hwpoison-inject.c11
-rw-r--r--mm/internal.h53
-rw-r--r--mm/khugepaged.c40
-rw-r--r--mm/ksm.c291
-rw-r--r--mm/madvise.c119
-rw-r--r--mm/memcontrol.c107
-rw-r--r--mm/memory-failure.c168
-rw-r--r--mm/memory-tiers.c123
-rw-r--r--mm/memory.c117
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/memremap.c10
-rw-r--r--mm/migrate.c6
-rw-r--r--mm/migrate_device.c25
-rw-r--r--mm/mmap.c47
-rw-r--r--mm/mprotect.c10
-rw-r--r--mm/mremap.c31
-rw-r--r--mm/mseal.c307
-rw-r--r--mm/page-writeback.c79
-rw-r--r--mm/page_alloc.c62
-rw-r--r--mm/page_io.c1
-rw-r--r--mm/page_table_check.c30
-rw-r--r--mm/page_vma_mapped.c18
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/rmap.c75
-rw-r--r--mm/slab.h2
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c19
-rw-r--r--mm/swapfile.c39
-rw-r--r--mm/userfaultfd.c27
-rw-r--r--mm/vmalloc.c10
-rw-r--r--mm/vmscan.c40
-rw-r--r--mm/zswap.c98
-rw-r--r--samples/kfifo/dma-example.c3
-rw-r--r--scripts/Makefile.extrawarn29
-rw-r--r--scripts/gdb/linux/cpus.py11
-rw-r--r--scripts/gdb/linux/tasks.py2
-rw-r--r--scripts/gdb/linux/utils.py2
-rw-r--r--tools/include/linux/rbtree_augmented.h4
-rw-r--r--tools/lib/rbtree.c2
-rw-r--r--tools/testing/selftests/mm/.gitignore2
-rw-r--r--tools/testing/selftests/mm/Makefile2
-rw-r--r--tools/testing/selftests/mm/mseal_test.c1893
-rw-r--r--tools/testing/selftests/mm/seal_elf.c179
-rw-r--r--tools/testing/selftests/mm/soft-dirty.c2
-rw-r--r--tools/writeback/wb_monitor.py172
-rw-r--r--virt/kvm/kvm_main.c4
180 files changed, 5408 insertions, 1638 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon
index dad4d5ffd78656..cef6e1d20b1855 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-damon
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon
@@ -314,9 +314,9 @@ Date: Dec 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the type of
the memory of the interest. 'anon' for anonymous pages,
- 'memcg' for specific memory cgroup, 'addr' for address range
- (an open-ended interval), or 'target' for DAMON monitoring
- target can be written and read.
+ 'memcg' for specific memory cgroup, 'young' for young pages,
+ 'addr' for address range (an open-ended interval), or 'target'
+ for DAMON monitoring target can be written and read.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/memcg_path
Date: Dec 2022
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage b/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage
new file mode 100644
index 00000000000000..7bfbb9cc2c1130
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-transparent-hugepage
@@ -0,0 +1,18 @@
+What: /sys/kernel/mm/transparent_hugepage/
+Date: April 2024
+Contact: Linux memory management mailing list <linux-mm@kvack.org>
+Description:
+ /sys/kernel/mm/transparent_hugepage/ contains a number of files and
+ subdirectories,
+
+ - defrag
+ - enabled
+ - hpage_pmd_size
+ - khugepaged
+ - shmem_enabled
+ - use_zero_page
+ - subdirectories of the form hugepages-<size>kB, where <size>
+ is the page size of the hugepages supported by the kernel/CPU
+ combination.
+
+ See Documentation/admin-guide/mm/transhuge.rst for details.
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 46110e6a31bbd9..9cde26d3384355 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -802,8 +802,8 @@ a page or a swap can be moved only when it is charged to the task's current
| | anonymous pages, file pages (and swaps) in the range mmapped by the task |
| | will be moved even if the task hasn't done page fault, i.e. they might |
| | not be the task's "RSS", but other task's "RSS" that maps the same file. |
-| | And mapcount of the page is ignored (the page can be moved even if |
-| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to |
+| | The mapcount of the page is ignored (the page can be moved independent |
+| | of the mapcount). You must enable Swap Extension (see 2.4) to |
| | enable move of swap charges. |
+---+--------------------------------------------------------------------------+
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 17e6e956515640..0270517ade47cf 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1296,17 +1296,10 @@ PAGE_SIZE multiple when read back.
This is a simple interface to trigger memory reclaim in the
target cgroup.
- This file accepts a single key, the number of bytes to reclaim.
- No nested keys are currently supported.
-
Example::
echo "1G" > memory.reclaim
- The interface can be later extended with nested keys to
- configure the reclaim behavior. For example, specify the
- type of memory to reclaim from (anon, file, ..).
-
Please note that the kernel can over or under reclaim from
the target cgroup. If less bytes are reclaimed than the
specified amount, -EAGAIN is returned.
@@ -1318,6 +1311,17 @@ PAGE_SIZE multiple when read back.
This means that the networking layer will not adapt based on
reclaim induced by memory.reclaim.
+The following nested keys are defined.
+
+ ========== ================================
+ swappiness Swappiness value to reclaim with
+ ========== ================================
+
+ Specifying a swappiness value instructs the kernel to perform
+ the reclaim with that swappiness value. Note that this has the
+ same semantics as vm.swappiness applied to memcg reclaim with
+ all the existing limitations and potential future extensions.
+
memory.peak
A read-only single value file which exists on non-root
cgroups.
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 6fce035fdbf5c7..69bc8fabf3781c 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -410,19 +410,19 @@ in the numeric order.
Each filter directory contains six files, namely ``type``, ``matcing``,
``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. To ``type``
-file, you can write one of four special keywords: ``anon`` for anonymous pages,
-``memcg`` for specific memory cgroup, ``addr`` for specific address range (an
-open-ended interval), or ``target`` for specific DAMON monitoring target
-filtering. In case of the memory cgroup filtering, you can specify the memory
-cgroup of the interest by writing the path of the memory cgroup from the
-cgroups mount point to ``memcg_path`` file. In case of the address range
-filtering, you can specify the start and end address of the range to
-``addr_start`` and ``addr_end`` files, respectively. For the DAMON monitoring
-target filtering, you can specify the index of the target between the list of
-the DAMON context's monitoring targets list to ``target_idx`` file. You can
-write ``Y`` or ``N`` to ``matching`` file to filter out pages that does or does
-not match to the type, respectively. Then, the scheme's action will not be
-applied to the pages that specified to be filtered out.
+file, you can write one of five special keywords: ``anon`` for anonymous pages,
+``memcg`` for specific memory cgroup, ``young`` for young pages, ``addr`` for
+specific address range (an open-ended interval), or ``target`` for specific
+DAMON monitoring target filtering. In case of the memory cgroup filtering, you
+can specify the memory cgroup of the interest by writing the path of the memory
+cgroup from the cgroups mount point to ``memcg_path`` file. In case of the
+address range filtering, you can specify the start and end address of the range
+to ``addr_start`` and ``addr_end`` files, respectively. For the DAMON
+monitoring target filtering, you can specify the index of the target between
+the list of the DAMON context's monitoring targets list to ``target_idx`` file.
+You can write ``Y`` or ``N`` to ``matching`` file to filter out pages that does
+or does not match to the type, respectively. Then, the scheme's action will
+not be applied to the pages that specified to be filtered out.
For example, below restricts a DAMOS action to be applied to only non-anonymous
pages of all memory cgroups except ``/having_care_already``.::
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 04eb45a2f94069..076443cc10a6c7 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -278,7 +278,8 @@ collapsed, resulting fewer pages being collapsed into
THPs, and lower memory access performance.
``max_ptes_shared`` specifies how many pages can be shared across multiple
-processes. Exceeding the number would block the collapse::
+processes. khugepaged might treat pages of THPs as shared if any page of
+that THP is shared. Exceeding the number would block the collapse::
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
@@ -369,7 +370,7 @@ monitor how successfully the system is providing huge pages for use.
thp_fault_alloc
is incremented every time a huge page is successfully
- allocated to handle a page fault.
+ allocated and charged to handle a page fault.
thp_collapse_alloc
is incremented by khugepaged when it has found
@@ -377,7 +378,7 @@ thp_collapse_alloc
successfully allocated a new huge page to store the data.
thp_fault_fallback
- is incremented if a page fault fails to allocate
+ is incremented if a page fault fails to allocate or charge
a huge page and instead falls back to using small pages.
thp_fault_fallback_charge
@@ -447,6 +448,34 @@ thp_swpout_fallback
Usually because failed to allocate some continuous swap space
for the huge page.
+In /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats, There are
+also individual counters for each huge page size, which can be utilized to
+monitor the system's effectiveness in providing huge pages for usage. Each
+counter has its own corresponding file.
+
+anon_fault_alloc
+ is incremented every time a huge page is successfully
+ allocated and charged to handle a page fault.
+
+anon_fault_fallback
+ is incremented if a page fault fails to allocate or charge
+ a huge page and instead falls back to using huge pages with
+ lower orders or small pages.
+
+anon_fault_fallback_charge
+ is incremented if a page fault fails to charge a huge page and
+ instead falls back to using huge pages with lower orders or
+ small pages even though the allocation was successful.
+
+anon_swpout
+ is incremented every time a huge page is swapped out in one
+ piece without splitting.
+
+anon_swpout_fallback
+ is incremented if a huge page has to be split before swapout.
+ Usually because failed to allocate some continuous swap space
+ for the huge page.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
index 13632671adaeaa..3598dcd7dbe7e1 100644
--- a/Documentation/admin-guide/mm/zswap.rst
+++ b/Documentation/admin-guide/mm/zswap.rst
@@ -111,35 +111,6 @@ checked if it is a same-value filled page before compressing it. If true, the
compressed length of the page is set to zero and the pattern or same-filled
value is stored.
-Same-value filled pages identification feature is enabled by default and can be
-disabled at boot time by setting the ``same_filled_pages_enabled`` attribute
-to 0, e.g. ``zswap.same_filled_pages_enabled=0``. It can also be enabled and
-disabled at runtime using the sysfs ``same_filled_pages_enabled``
-attribute, e.g.::
-
- echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
-
-When zswap same-filled page identification is disabled at runtime, it will stop
-checking for the same-value filled pages during store operation.
-In other words, every page will be then considered non-same-value filled.
-However, the existing pages which are marked as same-value filled pages remain
-stored unchanged in zswap until they are either loaded or invalidated.
-
-In some circumstances it might be advantageous to make use of just the zswap
-ability to efficiently store same-filled pages without enabling the whole
-compressed page storage.
-In this case the handling of non-same-value pages by zswap (enabled by default)
-can be disabled by setting the ``non_same_filled_pages_enabled`` attribute
-to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``.
-It can also be enabled and disabled at runtime using the sysfs
-``non_same_filled_pages_enabled`` attribute, e.g.::
-
- echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled
-
-Disabling both ``zswap.same_filled_pages_enabled`` and
-``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new
-pages by zswap.
-
To prevent zswap from shrinking pool when zswap is full and there's a high
pressure on swap (this will result in flipping pages in and out zswap pool
without any real benefit but with a performance drop for the system), a
diff --git a/Documentation/core-api/floating-point.rst b/Documentation/core-api/floating-point.rst
new file mode 100644
index 00000000000000..a8d0d4b050529f
--- /dev/null
+++ b/Documentation/core-api/floating-point.rst
@@ -0,0 +1,78 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+Floating-point API
+==================
+
+Kernel code is normally prohibited from using floating-point (FP) registers or
+instructions, including the C float and double data types. This rule reduces
+system call overhead, because the kernel does not need to save and restore the
+userspace floating-point register state.
+
+However, occasionally drivers or library functions may need to include FP code.
+This is supported by isolating the functions containing FP code to a separate
+translation unit (a separate source file), and saving/restoring the FP register
+state around calls to those functions. This creates "critical sections" of
+floating-point usage.
+
+The reason for this isolation is to prevent the compiler from generating code
+touching the FP registers outside these critical sections. Compilers sometimes
+use FP registers to optimize inlined ``memcpy`` or variable assignment, as
+floating-point registers may be wider than general-purpose registers.
+
+Usability of floating-point code within the kernel is architecture-specific.
+Additionally, because a single kernel may be configured to support platforms
+both with and without a floating-point unit, FPU availability must be checked
+both at build time and at run time.
+
+Several architectures implement the generic kernel floating-point API from
+``linux/fpu.h``, as described below. Some other architectures implement their
+own unique APIs, which are documented separately.
+
+Build-time API
+--------------
+
+Floating-point code may be built if the option ``ARCH_HAS_KERNEL_FPU_SUPPORT``
+is enabled. For C code, such code must be placed in a separate file, and that
+file must have its compilation flags adjusted using the following pattern::
+
+ CFLAGS_foo.o += $(CC_FLAGS_FPU)
+ CFLAGS_REMOVE_foo.o += $(CC_FLAGS_NO_FPU)
+
+Architectures are expected to define one or both of these variables in their
+top-level Makefile as needed. For example::
+
+ CC_FLAGS_FPU := -mhard-float
+
+or::
+
+ CC_FLAGS_NO_FPU := -msoft-float
+
+Normal kernel code is assumed to use the equivalent of ``CC_FLAGS_NO_FPU``.
+
+Runtime API
+-----------
+
+The runtime API is provided in ``linux/fpu.h``. This header cannot be included
+from files implementing FP code (those with their compilation flags adjusted as
+above). Instead, it must be included when defining the FP critical sections.
+
+.. c:function:: bool kernel_fpu_available( void )
+
+ This function reports if floating-point code can be used on this CPU or
+ platform. The value returned by this function is not expected to change
+ at runtime, so it only needs to be called once, not before every
+ critical section.
+
+.. c:function:: void kernel_fpu_begin( void )
+ void kernel_fpu_end( void )
+
+ These functions create a floating-point critical section. It is only
+ valid to call ``kernel_fpu_begin()`` after a previous call to
+ ``kernel_fpu_available()`` returned ``true``. These functions are only
+ guaranteed to be callable from (preemptible or non-preemptible) process
+ context.
+
+ Preemption may be disabled inside critical sections, so their size
+ should be minimized. They are *not* required to be reentrant. If the
+ caller expects to nest critical sections, it must implement its own
+ reference counting.
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 7a3a08d81f111c..974beccd671f64 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -48,6 +48,7 @@ Library functionality that is used throughout the kernel.
errseq
wrappers/atomic_t
wrappers/atomic_bitops
+ floating-point
Low level entry and exit
========================
diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
index de587cf9cbed45..4cb1d52ea6dc4d 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -457,7 +457,6 @@ Use the following commands to enable zswap::
# echo deflate-iaa > /sys/module/zswap/parameters/compressor
# echo zsmalloc > /sys/module/zswap/parameters/zpool
# echo 1 > /sys/module/zswap/parameters/enabled
- # echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
# echo 100 > /proc/sys/vm/swappiness
# echo never > /sys/kernel/mm/transparent_hugepage/enabled
# echo 1 > /proc/sys/vm/overcommit_memory
@@ -599,7 +598,6 @@ the 'fixed' compression mode::
echo deflate-iaa > /sys/module/zswap/parameters/compressor
echo zsmalloc > /sys/module/zswap/parameters/zpool
echo 1 > /sys/module/zswap/parameters/enabled
- echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
echo 100 > /proc/sys/vm/swappiness
echo never > /sys/kernel/mm/transparent_hugepage/enabled
diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst
index 98db2ea5fa12c6..cc5cc7f3fbd8e9 100644
--- a/Documentation/filesystems/api-summary.rst
+++ b/Documentation/filesystems/api-summary.rst
@@ -56,9 +56,6 @@ Other Functions
.. kernel-doc:: fs/namei.c
:export:
-.. kernel-doc:: fs/buffer.c
- :export:
-
.. kernel-doc:: block/bio.c
:export:
diff --git a/Documentation/filesystems/buffer.rst b/Documentation/filesystems/buffer.rst
new file mode 100644
index 00000000000000..ae24faf68efaba
--- /dev/null
+++ b/Documentation/filesystems/buffer.rst
@@ -0,0 +1,12 @@
+Buffer Heads
+============
+
+Linux uses buffer heads to maintain state about individual filesystem blocks.
+Buffer heads are deprecated and new filesystems should use iomap instead.
+
+Functions
+---------
+
+.. kernel-doc:: include/linux/buffer_head.h
+.. kernel-doc:: fs/buffer.c
+ :export:
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index 1f9b4c905a6a7c..8f5c1ee02e2f94 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -50,6 +50,7 @@ filesystem implementations.
.. toctree::
:maxdepth: 2
+ buffer
journalling
fscrypt
fsverity
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 5620aab9b38507..f2baf617184d0a 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -461,15 +461,17 @@ number of filters for each scheme. Each filter specifies the type of target
memory, and whether it should exclude the memory of the type (filter-out), or
all except the memory of the type (filter-in).
-Currently, anonymous page, memory cgroup, address range, and DAMON monitoring
-target type filters are supported by the feature. Some filter target types
-require additional arguments. The memory cgroup filter type asks users to
-specify the file path of the memory cgroup for the filter. The address range
-type asks the start and end addresses of the range. The DAMON monitoring
-target type asks the index of the target from the context's monitoring targets
-list. Hence, users can apply specific schemes to only anonymous pages,
-non-anonymous pages, pages of specific cgroups, all pages excluding those of
-specific cgroups, pages in specific address range, pages in specific DAMON
+Currently, anonymous page, memory cgroup, young page, address range, and DAMON
+monitoring target type filters are supported by the feature. Some filter
+target types require additional arguments. The memory cgroup filter type asks
+users to specify the file path of the memory cgroup for the filter. The
+address range type asks the start and end addresses of the range. The DAMON
+monitoring target type asks the index of the target from the context's
+monitoring targets list. Hence, users can apply specific schemes to only
+anonymous pages, non-anonymous pages, pages of specific cgroups, all pages
+excluding those of specific cgroups, pages that not accessed after the last
+access check from the scheme, pages that accessed after the last access check
+from the scheme, pages in specific address range, pages in specific DAMON
monitoring targets, and any combination of those.
To handle filters efficiently, the address range and DAMON monitoring target
diff --git a/Documentation/mm/page_table_check.rst b/Documentation/mm/page_table_check.rst
index c12838ce6b8de2..c59f22eb6a0f9a 100644
--- a/Documentation/mm/page_table_check.rst
+++ b/Documentation/mm/page_table_check.rst
@@ -14,7 +14,7 @@ Page table check performs extra verifications at the time when new pages become
accessible from the userspace by getting their page table entries (PTEs PMDs
etc.) added into the table.
-In case of detected corruption, the kernel is crashed. There is a small
+In case of most detected corruption, the kernel is crashed. There is a small
performance and memory overhead associated with the page table check. Therefore,
it is disabled by default, but can be optionally enabled on systems where the
extra hardening outweighs the performance costs. Also, because page table check
@@ -22,6 +22,13 @@ is synchronous, it can help with debugging double map memory corruption issues,
by crashing kernel at the time wrong mapping occurs instead of later which is
often the case with memory corruptions bugs.
+It can also be used to do page table entry checks over various flags, dump
+warnings when illegal combinations of entry flags are detected. Currently,
+userfaultfd is the only user of such to sanity check wr-protect bit against
+any writable flags. Illegal flag combinations will not directly cause data
+corruption in this case immediately, but that will cause read-only data to
+be writable, leading to corrupt when the page content is later modified.
+
Double mapping detection logic
==============================
diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst
index 93c9239b9ebe23..1ba0ad63246c53 100644
--- a/Documentation/mm/transhuge.rst
+++ b/Documentation/mm/transhuge.rst
@@ -116,14 +116,14 @@ pages:
succeeds on tail pages.
- map/unmap of a PMD entry for the whole THP increment/decrement
- folio->_entire_mapcount and also increment/decrement
- folio->_nr_pages_mapped by ENTIRELY_MAPPED when _entire_mapcount
- goes from -1 to 0 or 0 to -1.
+ folio->_entire_mapcount, increment/decrement folio->_large_mapcount
+ and also increment/decrement folio->_nr_pages_mapped by ENTIRELY_MAPPED
+ when _entire_mapcount goes from -1 to 0 or 0 to -1.
- map/unmap of individual pages with PTE entry increment/decrement
- page->_mapcount and also increment/decrement folio->_nr_pages_mapped
- when page->_mapcount goes from -1 to 0 or 0 to -1 as this counts
- the number of pages mapped by PTE.
+ page->_mapcount, increment/decrement folio->_large_mapcount and also
+ increment/decrement folio->_nr_pages_mapped when page->_mapcount goes
+ from -1 to 0 or 0 to -1 as this counts the number of pages mapped by PTE.
split_huge_page internally has to distribute the refcounts in the head
page to the tail pages before clearing all PG_head/tail bits from the page
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index afecfe3cc4a86b..5926115ec0ed86 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -20,6 +20,7 @@ System calls
futex2
ebpf/index
ioctl/index
+ mseal
Security-related interfaces
===========================
diff --git a/Documentation/userspace-api/mseal.rst b/Documentation/userspace-api/mseal.rst
new file mode 100644
index 00000000000000..4132eec995a399
--- /dev/null
+++ b/Documentation/userspace-api/mseal.rst
@@ -0,0 +1,199 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Introduction of mseal
+=====================
+
+:Author: Jeff Xu <jeffxu@chromium.org>
+
+Modern CPUs support memory permissions such as RW and NX bits. The memory
+permission feature improves security stance on memory corruption bugs, i.e.
+the attacker can’t just write to arbitrary memory and point the code to it,
+the memory has to be marked with X bit, or else an exception will happen.
+
+Memory sealing additionally protects the mapping itself against
+modifications. This is useful to mitigate memory corruption issues where a
+corrupted pointer is passed to a memory management system. For example,
+such an attacker primitive can break control-flow integrity guarantees
+since read-only memory that is supposed to be trusted can become writable
+or .text pages can get remapped. Memory sealing can automatically be
+applied by the runtime loader to seal .text and .rodata pages and
+applications can additionally seal security critical data at runtime.
+
+A similar feature already exists in the XNU kernel with the
+VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].
+
+User API
+========
+mseal()
+-----------
+The mseal() syscall has the following signature:
+
+``int mseal(void addr, size_t len, unsigned long flags)``
+
+**addr/len**: virtual memory address range.
+
+The address range set by ``addr``/``len`` must meet:
+ - The start address must be in an allocated VMA.
+ - The start address must be page aligned.
+ - The end address (``addr`` + ``len``) must be in an allocated VMA.
+ - no gap (unallocated memory) between start and end address.
+
+The ``len`` will be paged aligned implicitly by the kernel.
+
+**flags**: reserved for future use.
+
+**return values**:
+
+- ``0``: Success.
+
+- ``-EINVAL``:
+ - Invalid input ``flags``.
+ - The start address (``addr``) is not page aligned.
+ - Address range (``addr`` + ``len``) overflow.
+
+- ``-ENOMEM``:
+ - The start address (``addr``) is not allocated.
+ - The end address (``addr`` + ``len``) is not allocated.
+ - A gap (unallocated memory) between start and end address.
+
+- ``-EPERM``:
+ - sealing is supported only on 64-bit CPUs, 32-bit is not supported.
+
+- For above error cases, users can expect the given memory range is
+ unmodified, i.e. no partial update.
+
+- There might be other internal errors/cases not listed here, e.g.
+ error during merging/splitting VMAs, or the process reaching the max
+ number of supported VMAs. In those cases, partial updates to the given
+ memory range could happen. However, those cases should be rare.
+
+**Blocked operations after sealing**:
+ Unmapping, moving to another location, and shrinking the size,
+ via munmap() and mremap(), can leave an empty space, therefore
+ can be replaced with a VMA with a new set of attributes.
+
+ Moving or expanding a different VMA into the current location,
+ via mremap().
+
+ Modifying a VMA via mmap(MAP_FIXED).
+
+ Size expansion, via mremap(), does not appear to pose any
+ specific risks to sealed VMAs. It is included anyway because
+ the use case is unclear. In any case, users can rely on
+ merging to expand a sealed VMA.
+
+ mprotect() and pkey_mprotect().
+
+ Some destructive madvice() behaviors (e.g. MADV_DONTNEED)
+ for anonymous memory, when users don't have write permission to the
+ memory. Those behaviors can alter region contents by discarding pages,
+ effectively a memset(0) for anonymous memory.
+
+ Kernel will return -EPERM for blocked operations.
+
+ For blocked operations, one can expect the given address is unmodified,
+ i.e. no partial update. Note, this is different from existing mm
+ system call behaviors, where partial updates are made till an error is
+ found and returned to userspace. To give an example:
+
+ Assume following code sequence:
+
+ - ptr = mmap(null, 8192, PROT_NONE);
+ - munmap(ptr + 4096, 4096);
+ - ret1 = mprotect(ptr, 8192, PROT_READ);
+ - mseal(ptr, 4096);
+ - ret2 = mprotect(ptr, 8192, PROT_NONE);
+
+ ret1 will be -ENOMEM, the page from ptr is updated to PROT_READ.
+
+ ret2 will be -EPERM, the page remains to be PROT_READ.
+
+**Note**:
+
+- mseal() only works on 64-bit CPUs, not 32-bit CPU.
+
+- users can call mseal() multiple times, mseal() on an already sealed memory
+ is a no-action (not error).
+
+- munseal() is not supported.
+
+Use cases:
+==========
+- glibc:
+ The dynamic linker, during loading ELF executables, can apply sealing to
+ non-writable memory segments.
+
+- Chrome browser: protect some security sensitive data-structures.
+
+Notes on which memory to seal:
+==============================
+
+It might be important to note that sealing changes the lifetime of a mapping,
+i.e. the sealed mapping won’t be unmapped till the process terminates or the
+exec system call is invoked. Applications can apply sealing to any virtual
+memory region from userspace, but it is crucial to thoroughly analyze the
+mapping's lifetime prior to apply the sealing.
+
+For example:
+
+- aio/shm
+
+ aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt() in
+ shm.c. The lifetime of those mapping are not tied to the lifetime of the
+ process. If those memories are sealed from userspace, then munmap() will fail,
+ causing leaks in VMA address space during the lifetime of the process.
+
+- Brk (heap)
+
+ Currently, userspace applications can seal parts of the heap by calling
+ malloc() and mseal().
+ let's assume following calls from user space:
+
+ - ptr = malloc(size);
+ - mprotect(ptr, size, RO);
+ - mseal(ptr, size);
+ - free(ptr);
+
+ Technically, before mseal() is added, the user can change the protection of
+ the heap by calling mprotect(RO). As long as the user changes the protection
+ back to RW before free(), the memory range can be reused.
+
+ Adding mseal() into the picture, however, the heap is then sealed partially,
+ the user can still free it, but the memory remains to be RO. If the address
+ is re-used by the heap manager for another malloc, the process might crash
+ soon after. Therefore, it is important not to apply sealing to any memory
+ that might get recycled.
+
+ Furthermore, even if the application never calls the free() for the ptr,
+ the heap manager may invoke the brk system call to shrink the size of the
+ heap. In the kernel, the brk-shrink will call munmap(). Consequently,
+ depending on the location of the ptr, the outcome of brk-shrink is
+ nondeterministic.
+
+
+Additional notes:
+=================
+As Jann Horn pointed out in [3], there are still a few ways to write
+to RO memory, which is, in a way, by design. Those cases are not covered
+by mseal(). If applications want to block such cases, sandbox tools (such as
+seccomp, LSM, etc) might be considered.
+
+Those cases are:
+
+- Write to read-only memory through /proc/self/mem interface.
+- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
+- userfaultfd.
+
+The idea that inspired this patch comes from Stephen Röttger’s work in V8
+CFI [4]. Chrome browser in ChromeOS will be the first user of this API.
+
+Reference:
+==========
+[1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
+
+[2] https://man.openbsd.org/mimmutable.2
+
+[3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
+
+[4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
diff --git a/Makefile b/Makefile
index 40fb2ca6fe4c04..4a0b501865955c 100644
--- a/Makefile
+++ b/Makefile
@@ -964,6 +964,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_CFI)
export CC_FLAGS_CFI
endif
+# Architectures can define flags to add/remove for floating-point support
+CC_FLAGS_FPU += -D_LINUX_FPU_COMPILATION_UNIT
+export CC_FLAGS_FPU
+export CC_FLAGS_NO_FPU
+
ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0)
# Set the minimal function alignment. Use the newer GCC option
# -fmin-function-alignment if it is available, or fall back to -falign-funtions.
diff --git a/arch/Kconfig b/arch/Kconfig
index 30f7930275d838..c5962cd8356109 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1577,6 +1577,12 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG
address translations. Page table walkers that clear the accessed bit
may use this capability to reduce their search space.
+config ARCH_HAS_KERNEL_FPU_SUPPORT
+ bool
+ help
+ Architectures that select this option can run floating-point code in
+ the kernel, as described in Documentation/core-api/floating-point.rst.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 8ff110826ce21d..d8f96362e9f83e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -501,3 +501,4 @@
569 common lsm_get_self_attr sys_lsm_get_self_attr
570 common lsm_set_self_attr sys_lsm_set_self_attr
571 common lsm_list_modules sys_lsm_list_modules
+572 common mseal sys_mseal
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 817918f6635a50..35fc382635354a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -15,6 +15,7 @@ config ARM
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_KCOV
+ select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index d82908b1b1bb44..71afdd98ddf27f 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -130,6 +130,13 @@ endif
# Accept old syntax despite ".syntax unified"
AFLAGS_NOWARN :=$(call as-option,-Wa$(comma)-mno-warn-deprecated,-Wa$(comma)-W)
+# The GCC option -ffreestanding is required in order to compile code containing
+# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
+CC_FLAGS_FPU := -ffreestanding
+# Enable <arm_neon.h>
+CC_FLAGS_FPU += -isystem $(shell $(CC) -print-file-name=include)
+CC_FLAGS_FPU += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+
ifeq ($(CONFIG_THUMB2_KERNEL),y)
CFLAGS_ISA :=-Wa,-mimplicit-it=always $(AFLAGS_NOWARN)
AFLAGS_ISA :=$(CFLAGS_ISA) -Wa$(comma)-mthumb
diff --git a/arch/arm/include/asm/fpu.h b/arch/arm/include/asm/fpu.h
new file mode 100644
index 00000000000000..2ae50bdce59b43
--- /dev/null
+++ b/arch/arm/include/asm/fpu.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef __ASM_FPU_H
+#define __ASM_FPU_H
+
+#include <asm/neon.h>
+
+#define kernel_fpu_available() cpu_has_neon()
+#define kernel_fpu_begin() kernel_neon_begin()
+#define kernel_fpu_end() kernel_neon_end()
+
+#endif /* ! __ASM_FPU_H */
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 650404be6768a0..0ca5aae1bcc3e3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -40,8 +40,7 @@ $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S
$(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
- NEON_FLAGS := -march=armv7-a -mfloat-abi=softfp -mfpu=neon
- CFLAGS_xor-neon.o += $(NEON_FLAGS)
+ CFLAGS_xor-neon.o += $(CC_FLAGS_FPU)
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
endif
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 5c4b417e24f9ed..bf07afdb2dd9e1 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -226,9 +226,6 @@ void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
}
#ifdef CONFIG_MMU
-#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
-#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
-
static inline bool is_permission_fault(unsigned int fsr)
{
int fs = fsr_fs(fsr);
@@ -295,7 +292,8 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
if (!(vma->vm_flags & vm_flags)) {
vma_end_read(vma);
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
- fault = VM_FAULT_BADACCESS;
+ fault = 0;
+ code = SEGV_ACCERR;
goto bad_area;
}
fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
@@ -321,7 +319,8 @@ lock_mmap:
retry:
vma = lock_mm_and_find_vma(mm, addr, regs);
if (unlikely(!vma)) {
- fault = VM_FAULT_BADMAP;
+ fault = 0;
+ code = SEGV_MAPERR;
goto bad_area;
}
@@ -329,10 +328,14 @@ retry:
* ok, we have a good vm_area for this memory access, check the
* permissions on the VMA allow for the fault which occurred.
*/
- if (!(vma->vm_flags & vm_flags))
- fault = VM_FAULT_BADACCESS;
- else
- fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
+ if (!(vma->vm_flags & vm_flags)) {
+ mmap_read_unlock(mm);
+ fault = 0;
+ code = SEGV_ACCERR;
+ goto bad_area;
+ }
+
+ fault = handle_mm_fault(vma, addr & PAGE_MASK, flags, regs);
/* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_lock because
@@ -358,12 +361,11 @@ retry:
mmap_read_unlock(mm);
done:
- /*
- * Handle the "normal" case first - VM_FAULT_MAJOR
- */
- if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
+ /* Handle the "normal" case first */
+ if (likely(!(fault & VM_FAULT_ERROR)))
return 0;
+ code = SEGV_MAPERR;
bad_area:
/*
* If we are in kernel mode at this point, we
@@ -395,8 +397,6 @@ bad_area:
* isn't in our memory map..
*/
sig = SIGSEGV;
- code = fault == VM_FAULT_BADACCESS ?
- SEGV_ACCERR : SEGV_MAPERR;
}
__do_user_fault(addr, fsr, sig, code, regs);
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b6c9e01e14f559..2ed7d229c8f96d 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -475,3 +475,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index de076a191e9fd6..1bc3d2470ad1e5 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -30,6 +30,7 @@ config ARM64
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 0e075d3c546b48..3e863e5b016909 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -36,7 +36,14 @@ ifeq ($(CONFIG_BROKEN_GAS_INST),y)
$(warning Detected assembler with broken .inst; disassembly will be unreliable)
endif
-KBUILD_CFLAGS += -mgeneral-regs-only \
+# The GCC option -ffreestanding is required in order to compile code containing
+# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
+CC_FLAGS_FPU := -ffreestanding
+# Enable <arm_neon.h>
+CC_FLAGS_FPU += -isystem $(shell $(CC) -print-file-name=include)
+CC_FLAGS_NO_FPU := -mgeneral-regs-only
+
+KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU) \
$(compat_vdso) $(cc_has_k_constraint)
KBUILD_CFLAGS += $(call cc-disable-warning, psabi)
KBUILD_AFLAGS += $(compat_vdso)
diff --git a/arch/arm64/include/asm/fpu.h b/arch/arm64/include/asm/fpu.h
new file mode 100644
index 00000000000000..2ae50bdce59b43
--- /dev/null
+++ b/arch/arm64/include/asm/fpu.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef __ASM_FPU_H
+#define __ASM_FPU_H
+
+#include <asm/neon.h>
+
+#define kernel_fpu_available() cpu_has_neon()
+#define kernel_fpu_begin() kernel_neon_begin()
+#define kernel_fpu_end() kernel_neon_end()
+
+#endif /* ! __ASM_FPU_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9fd8613b2db2a9..1303d30287dce0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1223,6 +1223,46 @@ static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
__ptep_set_wrprotect(mm, address, ptep);
}
+static inline void __clear_young_dirty_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, cydp_t flags)
+{
+ pte_t old_pte;
+
+ do {
+ old_pte = pte;
+
+ if (flags & CYDP_CLEAR_YOUNG)
+ pte = pte_mkold(pte);
+ if (flags & CYDP_CLEAR_DIRTY)
+ pte = pte_mkclean(pte);
+
+ pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
+ pte_val(old_pte), pte_val(pte));
+ } while (pte_val(pte) != pte_val(old_pte));
+}
+
+static inline void __clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ pte_t pte;
+
+ for (;;) {
+ pte = __ptep_get(ptep);
+
+ if (flags == (CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY))
+ __set_pte(ptep, pte_mkclean(pte_mkold(pte)));
+ else
+ __clear_young_dirty_pte(vma, addr, ptep, pte, flags);
+
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
@@ -1379,6 +1419,9 @@ extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty);
+extern void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags);
static __always_inline void contpte_try_fold(struct mm_struct *mm,
unsigned long addr, pte_t *ptep, pte_t pte)
@@ -1603,6 +1646,17 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
}
+#define clear_young_dirty_ptes clear_young_dirty_ptes
+static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ if (likely(nr == 1 && !pte_cont(__ptep_get(ptep))))
+ __clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
+ else
+ contpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
+}
+
#else /* CONFIG_ARM64_CONTPTE */
#define ptep_get __ptep_get
@@ -1622,6 +1676,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
#define wrprotect_ptes __wrprotect_ptes
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
+#define clear_young_dirty_ptes __clear_young_dirty_ptes
#endif /* CONFIG_ARM64_CONTPTE */
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 491b2b9bd5536f..1346579f802f7f 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 462
+#define __NR_compat_syscalls 463
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 7118282d1c7976..266b96acc01439 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -929,6 +929,8 @@ __SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr)
__SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
#define __NR_lsm_list_modules 461
__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
+#define __NR_mseal 462
+__SYSCALL(__NR_mseal, sys_mseal)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 29490be2546bfc..13e6a2829116ca 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -7,10 +7,8 @@ lib-y := clear_user.o delay.o copy_from_user.o \
ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
-CFLAGS_REMOVE_xor-neon.o += -mgeneral-regs-only
-CFLAGS_xor-neon.o += -ffreestanding
-# Enable <arm_neon.h>
-CFLAGS_xor-neon.o += -isystem $(shell $(CC) -print-file-name=include)
+CFLAGS_xor-neon.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_xor-neon.o += $(CC_FLAGS_NO_FPU)
endif
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 1b64b4c3f8bf8a..9f9486de0004dd 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -361,6 +361,35 @@ void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes);
+void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ /*
+ * We can safely clear access/dirty without needing to unfold from
+ * the architectures perspective, even when contpte is set. If the
+ * range starts or ends midway through a contpte block, we can just
+ * expand to include the full contpte block. While this is not
+ * exactly what the core-mm asked for, it tracks access/dirty per
+ * folio, not per page. And since we only create a contpte block
+ * when it is covered by a single folio, we can get away with
+ * clearing access/dirty for the whole block.
+ */
+ unsigned long start = addr;
+ unsigned long end = start + nr;
+
+ if (pte_cont(__ptep_get(ptep + nr - 1)))
+ end = ALIGN(end, CONT_PTE_SIZE);
+
+ if (pte_cont(__ptep_get(ptep))) {
+ start = ALIGN_DOWN(start, CONT_PTE_SIZE);
+ ptep = contpte_align_down(ptep);
+ }
+
+ __clear_young_dirty_ptes(vma, start, ptep, end - start, flags);
+}
+EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
+
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 405f9aa831bd20..451ba7cbd5adb0 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -500,9 +500,6 @@ static bool is_write_abort(unsigned long esr)
return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}
-#define VM_FAULT_BADMAP ((__force vm_fault_t)0x010000)
-#define VM_FAULT_BADACCESS ((__force vm_fault_t)0x020000)
-
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
struct pt_regs *regs)
{
@@ -513,6 +510,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
unsigned int mm_flags = FAULT_FLAG_DEFAULT;
unsigned long addr = untagged_addr(far);
struct vm_area_struct *vma;
+ int si_code;
if (kprobe_page_fault(regs, esr))
return 0;
@@ -572,9 +570,10 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
if (!(vma->vm_flags & vm_flags)) {
vma_end_read(vma);
- fault = VM_FAULT_BADACCESS;
+ fault = 0;
+ si_code = SEGV_ACCERR;
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
- goto done;
+ goto bad_area;
}
fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
@@ -599,15 +598,19 @@ lock_mmap:
retry:
vma = lock_mm_and_find_vma(mm, addr, regs);
if (unlikely(!vma)) {
- fault = VM_FAULT_BADMAP;
- goto done;
+ fault = 0;
+ si_code = SEGV_MAPERR;
+ goto bad_area;
}
- if (!(vma->vm_flags & vm_flags))
- fault = VM_FAULT_BADACCESS;
- else
- fault = handle_mm_fault(vma, addr, mm_flags, regs);
+ if (!(vma->vm_flags & vm_flags)) {
+ mmap_read_unlock(mm);
+ fault = 0;
+ si_code = SEGV_ACCERR;
+ goto bad_area;
+ }
+ fault = handle_mm_fault(vma, addr, mm_flags, regs);
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
if (!user_mode(regs))
@@ -626,13 +629,12 @@ retry:
mmap_read_unlock(mm);
done:
- /*
- * Handle the "normal" (no error) case first.
- */
- if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
- VM_FAULT_BADACCESS))))
+ /* Handle the "normal" (no error) case first. */
+ if (likely(!(fault & VM_FAULT_ERROR)))
return 0;
+ si_code = SEGV_MAPERR;
+bad_area:
/*
* If we are in kernel mode at this point, we have no context to
* handle this fault with.
@@ -667,13 +669,8 @@ done:
arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
} else {
- /*
- * Something tried to access memory that isn't in our memory
- * map.
- */
- arm64_force_sig_fault(SIGSEGV,
- fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
- far, inf->name);
+ /* Something tried to access memory that out of memory map */
+ arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
}
return 0;
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 331450bd23c78c..13077142ae8251 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -18,6 +18,7 @@ config LOONGARCH
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KERNEL_FPU_SUPPORT if CPU_HAS_FPU
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_SPECIAL
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index df6caf79537a9d..efb5440a43ec4c 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -26,6 +26,9 @@ endif
32bit-emul = elf32loongarch
64bit-emul = elf64loongarch
+CC_FLAGS_FPU := -mfpu=64
+CC_FLAGS_NO_FPU := -msoft-float
+
ifdef CONFIG_UNWINDER_ORC
orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h
orc_hash_sh := $(srctree)/scripts/orc_hash.sh
@@ -59,7 +62,7 @@ ld-emul = $(64bit-emul)
cflags-y += -mabi=lp64s
endif
-cflags-y += -pipe -msoft-float
+cflags-y += -pipe $(CC_FLAGS_NO_FPU)
LDFLAGS_vmlinux += -static -n -nostdlib
# When the assembler supports explicit relocation hint, we must use it.
diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h
index c2d8962fda00be..3177674228f896 100644
--- a/arch/loongarch/include/asm/fpu.h
+++ b/arch/loongarch/include/asm/fpu.h
@@ -21,6 +21,7 @@
struct sigcontext;
+#define kernel_fpu_available() cpu_has_fpu
extern void kernel_fpu_begin(void);
extern void kernel_fpu_end(void);
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7fd43fd4c9f2e2..22a3cbd4c60298 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -461,3 +461,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index b00ab2cabab92a..2b81a6bd78b292 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -467,3 +467,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 83cfc9eb6b88e6..cc869f5d569318 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -400,3 +400,4 @@
459 n32 lsm_get_self_attr sys_lsm_get_self_attr
460 n32 lsm_set_self_attr sys_lsm_set_self_attr
461 n32 lsm_list_modules sys_lsm_list_modules
+462 n32 mseal sys_mseal
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 532b855df58957..1464c6be6eb3c7 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -376,3 +376,4 @@
459 n64 lsm_get_self_attr sys_lsm_get_self_attr
460 n64 lsm_set_self_attr sys_lsm_set_self_attr
461 n64 lsm_list_modules sys_lsm_list_modules
+462 n64 mseal sys_mseal
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index f45c9530ea93a1..008ebe60263e37 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -449,3 +449,4 @@
459 o32 lsm_get_self_attr sys_lsm_get_self_attr
460 o32 lsm_set_self_attr sys_lsm_set_self_attr
461 o32 lsm_list_modules sys_lsm_list_modules
+462 o32 mseal sys_mseal
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index b236a84c4e127e..b13c21373974c8 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -460,3 +460,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e42cc8cd415ff4..45addf6848ad15 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_HUGEPD if HUGETLB_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEMREMAP_COMPAT_ALIGN if PPC_64S_HASH_MMU
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 65261cbe5bfdbf..93d89f055b70b7 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -153,6 +153,9 @@ CFLAGS-$(CONFIG_PPC32) += $(call cc-option, $(MULTIPLEWORD))
CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata)
+CC_FLAGS_FPU := $(call cc-option,-mhard-float)
+CC_FLAGS_NO_FPU := $(call cc-option,-msoft-float)
+
ifdef CONFIG_FUNCTION_TRACER
ifdef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY
KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
@@ -174,7 +177,7 @@ asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
KBUILD_CPPFLAGS += -I $(srctree)/arch/powerpc $(asinstr)
KBUILD_AFLAGS += $(AFLAGS-y)
-KBUILD_CFLAGS += $(call cc-option,-msoft-float)
+KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU)
KBUILD_CFLAGS += $(CFLAGS-y)
CPP = $(CC) -E $(KBUILD_CFLAGS)
diff --git a/arch/powerpc/include/asm/fpu.h b/arch/powerpc/include/asm/fpu.h
new file mode 100644
index 00000000000000..ca584e4bc40f84
--- /dev/null
+++ b/arch/powerpc/include/asm/fpu.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_POWERPC_FPU_H
+#define _ASM_POWERPC_FPU_H
+
+#include <linux/preempt.h>
+
+#include <asm/cpu_has_feature.h>
+#include <asm/switch_to.h>
+
+#define kernel_fpu_available() (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
+
+static inline void kernel_fpu_begin(void)
+{
+ preempt_disable();
+ enable_kernel_fp();
+}
+
+static inline void kernel_fpu_end(void)
+{
+ disable_kernel_fp();
+ preempt_enable();
+}
+
+#endif /* ! _ASM_POWERPC_FPU_H */
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 17173b82ca21dc..3656f1ca7a21c6 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -548,3 +548,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3ee60ddef93ebf..bf1a83c09ea059 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -27,6 +27,7 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KERNEL_FPU_SUPPORT if 64BIT && FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MMIOWB
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 5b3115a1985226..032b9dc5afbf3f 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -84,6 +84,9 @@ KBUILD_CFLAGS += -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64i
KBUILD_AFLAGS += -march=$(riscv-march-y)
+# For C code built with floating-point support, exclude V but keep F and D.
+CC_FLAGS_FPU := -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/')
+
KBUILD_CFLAGS += -mno-save-restore
KBUILD_CFLAGS += -DCONFIG_PAGE_OFFSET=$(CONFIG_PAGE_OFFSET)
diff --git a/arch/riscv/include/asm/fpu.h b/arch/riscv/include/asm/fpu.h
new file mode 100644
index 00000000000000..91c04c244e12d3
--- /dev/null
+++ b/arch/riscv/include/asm/fpu.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_RISCV_FPU_H
+#define _ASM_RISCV_FPU_H
+
+#include <asm/switch_to.h>
+
+#define kernel_fpu_available() has_fpu()
+
+void kernel_fpu_begin(void);
+void kernel_fpu_end(void);
+
+#endif /* ! _ASM_RISCV_FPU_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 81d94a8ee10f29..5b243d46f4b174 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
obj-$(CONFIG_FPU) += fpu.o
+obj-$(CONFIG_FPU) += kernel_mode_fpu.o
obj-$(CONFIG_RISCV_ISA_V) += vector.o
obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o
obj-$(CONFIG_SMP) += smpboot.o
diff --git a/arch/riscv/kernel/kernel_mode_fpu.c b/arch/riscv/kernel/kernel_mode_fpu.c
new file mode 100644
index 00000000000000..0ac8348876c499
--- /dev/null
+++ b/arch/riscv/kernel/kernel_mode_fpu.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#include <linux/export.h>
+#include <linux/preempt.h>
+
+#include <asm/csr.h>
+#include <asm/fpu.h>
+#include <asm/processor.h>
+#include <asm/switch_to.h>
+
+void kernel_fpu_begin(void)
+{
+ preempt_disable();
+ fstate_save(current, task_pt_regs(current));
+ csr_set(CSR_SSTATUS, SR_FS);
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+ csr_clear(CSR_SSTATUS, SR_FS);
+ fstate_restore(current, task_pt_regs(current));
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 095bb86339a7d3..bd0fee24ad10a3 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -464,3 +464,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal sys_mseal
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index a90499c087f0c5..5398729bfe1b77 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -169,7 +169,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock_mmap;
- ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
+ ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
if (ret)
goto out_unlock_mmap;
@@ -308,7 +308,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
if (!(vma->vm_flags & VM_WRITE))
goto out_unlock_mmap;
- ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
+ ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
if (ret)
goto out_unlock_mmap;
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 86fe269f02203b..bbf83a2db9868d 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -464,3 +464,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
index 9bcaa5619eabd1..d8be352e14d20d 100644
--- a/arch/sh/mm/cache.c
+++ b/arch/sh/mm/cache.c
@@ -84,7 +84,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
{
struct folio *folio = page_folio(page);
- if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) &&
+ if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) &&
test_bit(PG_dcache_clean, &folio->flags)) {
void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
memcpy(dst, vfrom, len);
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index b23d59313589aa..ac6c281ccfe029 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -507,3 +507,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 59df6c4180810a..2e2a69647e5ec7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,6 +84,7 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_KERNEL_FPU_SUPPORT
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ab93fcdd691dc..164498b3382ae2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -74,6 +74,26 @@ KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+#
+# CFLAGS for compiling floating point code inside the kernel.
+#
+CC_FLAGS_FPU := -msse -msse2
+ifdef CONFIG_CC_IS_GCC
+# Stack alignment mismatch, proceed with caution.
+# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
+# (8B stack alignment).
+# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+# The "-msse" in the first argument is there so that the
+# -mpreferred-stack-boundary=3 build error:
+#
+# -mpreferred-stack-boundary=3 is not between 4 and 12
+#
+# can be triggered. Otherwise gcc doesn't complain.
+CC_FLAGS_FPU += -mhard-float
+CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
+endif
+
ifeq ($(CONFIG_X86_KERNEL_IBT),y)
#
# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 5f8591ce7f25e7..7fd1f57ad3d30c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -466,3 +466,4 @@
459 i386 lsm_get_self_attr sys_lsm_get_self_attr
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
461 i386 lsm_list_modules sys_lsm_list_modules
+462 i386 mseal sys_mseal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7e8d46f4147f57..52df0dec70da6c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -383,6 +383,7 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h
new file mode 100644
index 00000000000000..b2743fe19339a6
--- /dev/null
+++ b/arch/x86/include/asm/fpu.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+#include <asm/fpu/api.h>
+
+#define kernel_fpu_available() true
+
+#endif /* ! _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index ace9aa3b78a305..eb17f31b06d25c 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -2,8 +2,8 @@
/*
* FPU data structures:
*/
-#ifndef _ASM_X86_FPU_H
-#define _ASM_X86_FPU_H
+#ifndef _ASM_X86_FPU_TYPES_H
+#define _ASM_X86_FPU_TYPES_H
#include <asm/page_types.h>
@@ -596,4 +596,4 @@ struct fpu_state_config {
/* FPU state configuration information */
extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
-#endif /* _ASM_X86_FPU_H */
+#endif /* _ASM_X86_FPU_TYPES_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 273f7557218cd6..65b8e5bb902cc3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -388,23 +388,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
- bool wp = pte_flags(pte) & _PAGE_UFFD_WP;
-
-#ifdef CONFIG_DEBUG_VM
- /*
- * Having write bit for wr-protect-marked present ptes is fatal,
- * because it means the uffd-wp bit will be ignored and write will
- * just go through.
- *
- * Use any chance of pgtable walking to verify this (e.g., when
- * page swapped out or being migrated for all purposes). It means
- * something is already wrong. Tell the admin even before the
- * process crashes. We also nail it with wrong pgtable setup.
- */
- WARN_ON_ONCE(wp && pte_write(pte));
-#endif
-
- return wp;
+ return pte_flags(pte) & _PAGE_UFFD_WP;
}
static inline pte_t pte_mkuffd_wp(pte_t pte)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index d01c3b0bd6eb1c..bdc2a240c2aae6 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -954,10 +954,7 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
pte_t *ptep, pte;
spinlock_t *ptl;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- return -EINVAL;
-
- if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl))
+ if (follow_pte(vma, vma->vm_start, &ptep, &ptl))
return -EINVAL;
pte = ptep_get(ptep);
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index dd116598fb2517..67083fc1b2f563 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -432,3 +432,4 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c
index 4f974b74883cae..d8b60d6e50a8e7 100644
--- a/arch/xtensa/mm/tlb.c
+++ b/arch/xtensa/mm/tlb.c
@@ -256,12 +256,13 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb)
dtlb ? 'D' : 'I', w, e, r0, r1, pte);
if (pte == 0 || !pte_present(__pte(pte))) {
struct page *p = pfn_to_page(r1 >> PAGE_SHIFT);
- pr_err("page refcount: %d, mapcount: %d\n",
- page_count(p),
- page_mapcount(p));
- if (!page_count(p))
+ struct folio *f = page_folio(p);
+
+ pr_err("folio refcount: %d, mapcount: %d\n",
+ folio_ref_count(f), folio_mapcount(f));
+ if (!folio_ref_count(f))
rc |= TLB_INSANE;
- else if (page_mapcount(p))
+ else if (folio_mapped(f))
rc |= TLB_SUSPICIOUS;
} else {
rc |= TLB_INSANE;
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 42ee360cf4e3db..4fe9d040e37545 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -55,36 +55,14 @@ static LIST_HEAD(kmem_memory_types);
static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
{
- bool found = false;
- struct memory_dev_type *mtype;
-
- mutex_lock(&kmem_memory_type_lock);
- list_for_each_entry(mtype, &kmem_memory_types, list) {
- if (mtype->adistance == adist) {
- found = true;
- break;
- }
- }
- if (!found) {
- mtype = alloc_memory_type(adist);
- if (!IS_ERR(mtype))
- list_add(&mtype->list, &kmem_memory_types);
- }
- mutex_unlock(&kmem_memory_type_lock);
-
- return mtype;
+ guard(mutex)(&kmem_memory_type_lock);
+ return mt_find_alloc_memory_type(adist, &kmem_memory_types);
}
static void kmem_put_memory_types(void)
{
- struct memory_dev_type *mtype, *mtn;
-
- mutex_lock(&kmem_memory_type_lock);
- list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) {
- list_del(&mtype->list);
- put_memory_type(mtype);
- }
- mutex_unlock(&kmem_memory_type_lock);
+ guard(mutex)(&kmem_memory_type_lock);
+ mt_put_memory_types(&kmem_memory_types);
}
static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig
index 901d1961b73927..5fcd4f778dc3dd 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -8,7 +8,7 @@ config DRM_AMD_DC
depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64
select SND_HDA_COMPONENT if SND_HDA_CORE
# !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752
- select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || (ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG))
+ select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || !CC_IS_CLANG)
help
Choose this option if you want to use the new display engine
support for AMDGPU. This adds required support for Vega and
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
index 4ae4720535a56d..e46f8ce41d871c 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
@@ -26,16 +26,7 @@
#include "dc_trace.h"
-#if defined(CONFIG_X86)
-#include <asm/fpu/api.h>
-#elif defined(CONFIG_PPC64)
-#include <asm/switch_to.h>
-#include <asm/cputable.h>
-#elif defined(CONFIG_ARM64)
-#include <asm/neon.h>
-#elif defined(CONFIG_LOONGARCH)
-#include <asm/fpu.h>
-#endif
+#include <linux/fpu.h>
/**
* DOC: DC FPU manipulation overview
@@ -87,20 +78,9 @@ void dc_fpu_begin(const char *function_name, const int line)
WARN_ON_ONCE(!in_task());
preempt_disable();
depth = __this_cpu_inc_return(fpu_recursion_depth);
-
if (depth == 1) {
-#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
+ BUG_ON(!kernel_fpu_available());
kernel_fpu_begin();
-#elif defined(CONFIG_PPC64)
- if (cpu_has_feature(CPU_FTR_VSX_COMP))
- enable_kernel_vsx();
- else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
- enable_kernel_altivec();
- else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
- enable_kernel_fp();
-#elif defined(CONFIG_ARM64)
- kernel_neon_begin();
-#endif
}
TRACE_DCN_FPU(true, function_name, line, depth);
@@ -122,18 +102,7 @@ void dc_fpu_end(const char *function_name, const int line)
depth = __this_cpu_dec_return(fpu_recursion_depth);
if (depth == 0) {
-#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
kernel_fpu_end();
-#elif defined(CONFIG_PPC64)
- if (cpu_has_feature(CPU_FTR_VSX_COMP))
- disable_kernel_vsx();
- else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
- disable_kernel_altivec();
- else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
- disable_kernel_fp();
-#elif defined(CONFIG_ARM64)
- kernel_neon_end();
-#endif
} else {
WARN_ON_ONCE(depth < 0);
}
diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile
index c4a5efd2dda51c..a94b6d546cd1a1 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
+++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
@@ -25,40 +25,8 @@
# It provides the general basic services required by other DAL
# subcomponents.
-ifdef CONFIG_X86
-dml_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float
-dml_ccflags := $(dml_ccflags-y) -msse
-endif
-
-ifdef CONFIG_PPC64
-dml_ccflags := -mhard-float -maltivec
-endif
-
-ifdef CONFIG_ARM64
-dml_rcflags := -mgeneral-regs-only
-endif
-
-ifdef CONFIG_LOONGARCH
-dml_ccflags := -mfpu=64
-dml_rcflags := -msoft-float
-endif
-
-ifdef CONFIG_CC_IS_GCC
-ifneq ($(call gcc-min-version, 70100),y)
-IS_OLD_GCC = 1
-endif
-endif
-
-ifdef CONFIG_X86
-ifdef IS_OLD_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-dml_ccflags += -mpreferred-stack-boundary=4
-else
-dml_ccflags += -msse2
-endif
-endif
+dml_ccflags := $(CC_FLAGS_FPU)
+dml_rcflags := $(CC_FLAGS_NO_FPU)
ifneq ($(CONFIG_FRAME_WARN),0)
ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y)
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/Makefile b/drivers/gpu/drm/amd/display/dc/dml2/Makefile
index acff3449b8d787..4f6c804a26ad6c 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/Makefile
+++ b/drivers/gpu/drm/amd/display/dc/dml2/Makefile
@@ -24,40 +24,8 @@
#
# Makefile for dml2.
-ifdef CONFIG_X86
-dml2_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float
-dml2_ccflags := $(dml2_ccflags-y) -msse
-endif
-
-ifdef CONFIG_PPC64
-dml2_ccflags := -mhard-float -maltivec
-endif
-
-ifdef CONFIG_ARM64
-dml2_rcflags := -mgeneral-regs-only
-endif
-
-ifdef CONFIG_LOONGARCH
-dml2_ccflags := -mfpu=64
-dml2_rcflags := -msoft-float
-endif
-
-ifdef CONFIG_CC_IS_GCC
-ifeq ($(call cc-ifversion, -lt, 0701, y), y)
-IS_OLD_GCC = 1
-endif
-endif
-
-ifdef CONFIG_X86
-ifdef IS_OLD_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-dml2_ccflags += -mpreferred-stack-boundary=4
-else
-dml2_ccflags += -msse2
-endif
-endif
+dml2_ccflags := $(CC_FLAGS_FPU)
+dml2_rcflags := $(CC_FLAGS_NO_FPU)
ifneq ($(CONFIG_FRAME_WARN),0)
ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y)
diff --git a/drivers/media/cec/platform/sti/stih-cec.c b/drivers/media/cec/platform/sti/stih-cec.c
index a20fc5c0c88d0e..99978a7c8d9b70 100644
--- a/drivers/media/cec/platform/sti/stih-cec.c
+++ b/drivers/media/cec/platform/sti/stih-cec.c
@@ -6,6 +6,7 @@
*/
#include <linux/clk.h>
#include <linux/interrupt.h>
+#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/mfd/syscon.h>
#include <linux/module.h>
diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c
index 4e294e59d3cba8..b2f82b2d1c8d40 100644
--- a/drivers/media/rc/mtk-cir.c
+++ b/drivers/media/rc/mtk-cir.c
@@ -8,6 +8,7 @@
#include <linux/clk.h>
#include <linux/interrupt.h>
#include <linux/module.h>
+#include <linux/io.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/reset.h>
diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c
index 96ae0294ac102a..fc5fd392717720 100644
--- a/drivers/media/rc/serial_ir.c
+++ b/drivers/media/rc/serial_ir.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
+#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/serial_reg.h>
#include <linux/types.h>
diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c
index 28477aa955635d..988b09191c4c74 100644
--- a/drivers/media/rc/st_rc.c
+++ b/drivers/media/rc/st_rc.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/clk.h>
#include <linux/interrupt.h>
+#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c
index bf58c965ead8cf..b49df8355e6b39 100644
--- a/drivers/media/rc/sunxi-cir.c
+++ b/drivers/media/rc/sunxi-cir.c
@@ -12,6 +12,7 @@
#include <linux/clk.h>
#include <linux/interrupt.h>
+#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b5c15fe8f9fcf9..3a0218171cfaa0 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -518,7 +518,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
spinlock_t *ptl;
int ret;
- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+ ret = follow_pte(vma, vaddr, &ptep, &ptl);
if (ret) {
bool unlocked = false;
@@ -532,7 +532,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
if (ret)
return ret;
- ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+ ret = follow_pte(vma, vaddr, &ptep, &ptl);
if (ret)
return ret;
}
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
index b30077baf352b4..db8ff1d0ac23ce 100644
--- a/drivers/virt/acrn/mm.c
+++ b/drivers/virt/acrn/mm.c
@@ -156,23 +156,29 @@ int acrn_vm_memseg_unmap(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
{
struct vm_memory_region_batch *regions_info;
- int nr_pages, i = 0, order, nr_regions = 0;
+ int nr_pages, i, order, nr_regions = 0;
struct vm_memory_mapping *region_mapping;
struct vm_memory_region_op *vm_region;
struct page **pages = NULL, *page;
void *remap_vaddr;
int ret, pinned;
u64 user_vm_pa;
- unsigned long pfn;
struct vm_area_struct *vma;
if (!vm || !memmap)
return -EINVAL;
+ /* Get the page number of the map region */
+ nr_pages = memmap->len >> PAGE_SHIFT;
+ if (!nr_pages)
+ return -EINVAL;
+
mmap_read_lock(current->mm);
vma = vma_lookup(current->mm, memmap->vma_base);
if (vma && ((vma->vm_flags & VM_PFNMAP) != 0)) {
+ unsigned long start_pfn, cur_pfn;
spinlock_t *ptl;
+ bool writable;
pte_t *ptep;
if ((memmap->vma_base + memmap->len) > vma->vm_end) {
@@ -180,25 +186,52 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
return -EINVAL;
}
- ret = follow_pte(vma->vm_mm, memmap->vma_base, &ptep, &ptl);
- if (ret < 0) {
- mmap_read_unlock(current->mm);
+ for (i = 0; i < nr_pages; i++) {
+ ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE,
+ &ptep, &ptl);
+ if (ret)
+ break;
+
+ cur_pfn = pte_pfn(ptep_get(ptep));
+ if (i == 0)
+ start_pfn = cur_pfn;
+ writable = !!pte_write(ptep_get(ptep));
+ pte_unmap_unlock(ptep, ptl);
+
+ /* Disallow write access if the PTE is not writable. */
+ if (!writable &&
+ (memmap->attr & ACRN_MEM_ACCESS_WRITE)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ /* Disallow refcounted pages. */
+ if (pfn_valid(cur_pfn) &&
+ !PageReserved(pfn_to_page(cur_pfn))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ /* Disallow non-contiguous ranges. */
+ if (cur_pfn != start_pfn + i) {
+ ret = -EINVAL;
+ break;
+ }
+ }
+ mmap_read_unlock(current->mm);
+
+ if (ret) {
dev_dbg(acrn_dev.this_device,
"Failed to lookup PFN at VMA:%pK.\n", (void *)memmap->vma_base);
return ret;
}
- pfn = pte_pfn(ptep_get(ptep));
- pte_unmap_unlock(ptep, ptl);
- mmap_read_unlock(current->mm);
return acrn_mm_region_add(vm, memmap->user_vm_pa,
- PFN_PHYS(pfn), memmap->len,
+ PFN_PHYS(start_pfn), memmap->len,
ACRN_MEM_TYPE_WB, memmap->attr);
}
mmap_read_unlock(current->mm);
- /* Get the page number of the map region */
- nr_pages = memmap->len >> PAGE_SHIFT;
pages = vzalloc(array_size(nr_pages, sizeof(*pages)));
if (!pages)
return -ENOMEM;
@@ -242,12 +275,11 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
mutex_unlock(&vm->regions_mapping_lock);
/* Calculate count of vm_memory_region_op */
- while (i < nr_pages) {
+ for (i = 0; i < nr_pages; i += 1 << order) {
page = pages[i];
VM_BUG_ON_PAGE(PageTail(page), page);
order = compound_order(page);
nr_regions++;
- i += 1 << order;
}
/* Prepare the vm_memory_region_batch */
@@ -264,8 +296,7 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
regions_info->vmid = vm->vmid;
regions_info->regions_gpa = virt_to_phys(vm_region);
user_vm_pa = memmap->user_vm_pa;
- i = 0;
- while (i < nr_pages) {
+ for (i = 0; i < nr_pages; i += 1 << order) {
u32 region_size;
page = pages[i];
@@ -281,7 +312,6 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
vm_region++;
user_vm_pa += region_size;
- i += 1 << order;
}
/* Inform the ACRN Hypervisor to set up EPT mappings */
diff --git a/fs/buffer.c b/fs/buffer.c
index 4f73d23c2c4691..ed698caa8834b2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -687,30 +687,37 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
-/*
- * Add a page to the dirty page list.
- *
- * It is a sad fact of life that this function is called from several places
- * deeply under spinlocking. It may not sleep.
- *
- * If the page has buffers, the uptodate buffers are set dirty, to preserve
- * dirty-state coherency between the page and the buffers. It the page does
- * not have buffers then when they are later attached they will all be set
- * dirty.
- *
- * The buffers are dirtied before the page is dirtied. There's a small race
- * window in which a writepage caller may see the page cleanness but not the
- * buffer dirtiness. That's fine. If this code were to set the page dirty
- * before the buffers, a concurrent writepage caller could clear the page dirty
- * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
- * page on the dirty page list.
- *
- * We use i_private_lock to lock against try_to_free_buffers while using the
- * page's buffer list. Also use this to protect against clean buffers being
- * added to the page after it was set dirty.
- *
- * FIXME: may need to call ->reservepage here as well. That's rather up to the
- * address_space though.
+/**
+ * block_dirty_folio - Mark a folio as dirty.
+ * @mapping: The address space containing this folio.
+ * @folio: The folio to mark dirty.
+ *
+ * Filesystems which use buffer_heads can use this function as their
+ * ->dirty_folio implementation. Some filesystems need to do a little
+ * work before calling this function. Filesystems which do not use
+ * buffer_heads should call filemap_dirty_folio() instead.
+ *
+ * If the folio has buffers, the uptodate buffers are set dirty, to
+ * preserve dirty-state coherency between the folio and the buffers.
+ * Buffers added to a dirty folio are created dirty.
+ *
+ * The buffers are dirtied before the folio is dirtied. There's a small
+ * race window in which writeback may see the folio cleanness but not the
+ * buffer dirtiness. That's fine. If this code were to set the folio
+ * dirty before the buffers, writeback could clear the folio dirty flag,
+ * see a bunch of clean buffers and we'd end up with dirty buffers/clean
+ * folio on the dirty folio list.
+ *
+ * We use i_private_lock to lock against try_to_free_buffers() while
+ * using the folio's buffer list. This also prevents clean buffers
+ * being added to the folio after it was set dirty.
+ *
+ * Context: May only be called from process context. Does not sleep.
+ * Caller must ensure that @folio cannot be truncated during this call,
+ * typically by holding the folio lock or having a page in the folio
+ * mapped and holding the page table lock.
+ *
+ * Return: True if the folio was dirtied; false if it was already dirtied.
*/
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
@@ -1219,26 +1226,28 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
-/*
- * Decrement a buffer_head's reference count. If all buffers against a page
- * have zero reference count, are clean and unlocked, and if the page is clean
- * and unlocked then try_to_free_buffers() may strip the buffers from the page
- * in preparation for freeing it (sometimes, rarely, buffers are removed from
- * a page but it ends up not being freed, and buffers may later be reattached).
+/**
+ * __brelse - Release a buffer.
+ * @bh: The buffer to release.
+ *
+ * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
*/
-void __brelse(struct buffer_head * buf)
+void __brelse(struct buffer_head *bh)
{
- if (atomic_read(&buf->b_count)) {
- put_bh(buf);
+ if (atomic_read(&bh->b_count)) {
+ put_bh(bh);
return;
}
WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);
-/*
- * bforget() is like brelse(), except it discards any
- * potentially dirty data.
+/**
+ * __bforget - Discard any dirty data in a buffer.
+ * @bh: The buffer to forget.
+ *
+ * This variant of bforget() can be called if @bh is guaranteed to not
+ * be NULL.
*/
void __bforget(struct buffer_head *bh)
{
@@ -1415,6 +1424,11 @@ EXPORT_SYMBOL(__find_get_block);
* @size: The size of buffer_heads for this @bdev.
* @gfp: The memory allocation flags to use.
*
+ * The returned buffer head has its reference count incremented, but is
+ * not locked. The caller should call brelse() when it has finished
+ * with the buffer. The buffer may not be uptodate. If needed, the
+ * caller can bring it uptodate either by reading it or overwriting it.
+ *
* Return: The buffer head, or NULL if memory could not be allocated.
*/
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
@@ -1446,20 +1460,29 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
EXPORT_SYMBOL(__breadahead);
/**
- * __bread_gfp() - reads a specified block and returns the bh
- * @bdev: the block_device to read from
- * @block: number of block
- * @size: size (in bytes) to read
- * @gfp: page allocation flag
- *
- * Reads a specified block, and returns buffer head that contains it.
- * The page cache can be allocated from non-movable area
- * not to prevent page migration if you set gfp to zero.
- * It returns NULL if the block was unreadable.
+ * __bread_gfp() - Read a block.
+ * @bdev: The block device to read from.
+ * @block: Block number in units of block size.
+ * @size: The block size of this device in bytes.
+ * @gfp: Not page allocation flags; see below.
+ *
+ * You are not expected to call this function. You should use one of
+ * sb_bread(), sb_bread_unmovable() or __bread().
+ *
+ * Read a specified block, and return the buffer head that refers to it.
+ * If @gfp is 0, the memory will be allocated using the block device's
+ * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be
+ * allocated from a movable area. Do not pass in a complete set of
+ * GFP flags.
+ *
+ * The returned buffer head has its refcount increased. The caller should
+ * call brelse() when it has finished with the buffer.
+ *
+ * Context: May sleep waiting for I/O.
+ * Return: NULL if the block was unreadable.
*/
-struct buffer_head *
-__bread_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp)
+struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
struct buffer_head *bh;
@@ -2861,26 +2884,6 @@ int sync_dirty_buffer(struct buffer_head *bh)
}
EXPORT_SYMBOL(sync_dirty_buffer);
-/*
- * try_to_free_buffers() checks if all the buffers on this particular folio
- * are unused, and releases them if so.
- *
- * Exclusion against try_to_free_buffers may be obtained by either
- * locking the folio or by holding its mapping's i_private_lock.
- *
- * If the folio is dirty but all the buffers are clean then we need to
- * be sure to mark the folio clean as well. This is because the folio
- * may be against a block device, and a later reattachment of buffers
- * to a dirty folio will set *all* buffers dirty. Which would corrupt
- * filesystem data on the same device.
- *
- * The same applies to regular filesystem folios: if all the buffers are
- * clean then we set the folio clean and proceed. To do that, we require
- * total exclusion from block_dirty_folio(). That is obtained with
- * i_private_lock.
- *
- * try_to_free_buffers() is non-blocking.
- */
static inline int buffer_busy(struct buffer_head *bh)
{
return atomic_read(&bh->b_count) |
@@ -2914,6 +2917,30 @@ failed:
return false;
}
+/**
+ * try_to_free_buffers - Release buffers attached to this folio.
+ * @folio: The folio.
+ *
+ * If any buffers are in use (dirty, under writeback, elevated refcount),
+ * no buffers will be freed.
+ *
+ * If the folio is dirty but all the buffers are clean then we need to
+ * be sure to mark the folio clean as well. This is because the folio
+ * may be against a block device, and a later reattachment of buffers
+ * to a dirty folio will set *all* buffers dirty. Which would corrupt
+ * filesystem data on the same device.
+ *
+ * The same applies to regular filesystem folios: if all the buffers are
+ * clean then we set the folio clean and proceed. To do that, we require
+ * total exclusion from block_dirty_folio(). That is obtained with
+ * i_private_lock.
+ *
+ * Exclusion against try_to_free_buffers may be obtained by either
+ * locking the folio or by holding its mapping's i_private_lock.
+ *
+ * Context: Process context. @folio must be locked. Will not sleep.
+ * Return: true if all buffers attached to this folio were freed.
+ */
bool try_to_free_buffers(struct folio *folio)
{
struct address_space * const mapping = folio->mapping;
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index b4002aea7cdb9d..40de69860dcf97 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -284,7 +284,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
const struct inode **inode_ret,
u64 *lblk_num_ret)
{
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
const struct address_space *mapping;
const struct inode *inode;
@@ -292,13 +292,13 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh,
* The ext4 journal (jbd2) can submit a buffer_head it directly created
* for a non-pagecache page. fscrypt doesn't care about these.
*/
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (!mapping)
return false;
inode = mapping->host;
*inode_ret = inode;
- *lblk_num_ret = ((u64)page->index << (PAGE_SHIFT - inode->i_blkbits)) +
+ *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) +
(bh_offset(bh) >> inode->i_blkbits);
return true;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d9494b5fc7c185..961e6ff77c7235 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -4082,11 +4082,12 @@ const struct address_space_operations f2fs_dblock_aops = {
void f2fs_clear_page_cache_dirty_tag(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
+ struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio->mapping;
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- __xa_clear_mark(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 2e29b98ba8bab2..728e90be3570b7 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -335,8 +335,8 @@ void __nilfs_error(struct super_block *sb, const char *function,
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
-extern int nilfs_store_magic_and_option(struct super_block *,
- struct nilfs_super_block *, char *);
+extern int nilfs_store_magic(struct super_block *sb,
+ struct nilfs_super_block *sbp);
extern int nilfs_check_feature_compatibility(struct super_block *,
struct nilfs_super_block *);
extern void nilfs_set_log_cursor(struct nilfs_super_block *,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index ac24ed109ce935..e835e1f5a71201 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -29,13 +29,13 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
-#include <linux/parser.h>
#include <linux/crc32.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "nilfs.h"
#include "export.h"
#include "mdt.h"
@@ -61,7 +61,6 @@ struct kmem_cache *nilfs_segbuf_cachep;
struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
-static int nilfs_remount(struct super_block *sb, int *flags, char *data);
void __nilfs_msg(struct super_block *sb, const char *fmt, ...)
{
@@ -702,105 +701,98 @@ static const struct super_operations nilfs_sops = {
.freeze_fs = nilfs_freeze,
.unfreeze_fs = nilfs_unfreeze,
.statfs = nilfs_statfs,
- .remount_fs = nilfs_remount,
.show_options = nilfs_show_options
};
enum {
- Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
- Opt_discard, Opt_nodiscard, Opt_err,
+ Opt_err, Opt_barrier, Opt_snapshot, Opt_order, Opt_norecovery,
+ Opt_discard,
};
-static match_table_t tokens = {
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_snapshot, "cp=%u"},
- {Opt_order, "order=%s"},
- {Opt_norecovery, "norecovery"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_err, NULL}
+static const struct constant_table nilfs_param_err[] = {
+ {"continue", NILFS_MOUNT_ERRORS_CONT},
+ {"panic", NILFS_MOUNT_ERRORS_PANIC},
+ {"remount-ro", NILFS_MOUNT_ERRORS_RO},
+ {}
};
-static int parse_options(char *options, struct super_block *sb, int is_remount)
-{
- struct the_nilfs *nilfs = sb->s_fs_info;
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+static const struct fs_parameter_spec nilfs_param_spec[] = {
+ fsparam_enum ("errors", Opt_err, nilfs_param_err),
+ fsparam_flag_no ("barrier", Opt_barrier),
+ fsparam_u64 ("cp", Opt_snapshot),
+ fsparam_string ("order", Opt_order),
+ fsparam_flag ("norecovery", Opt_norecovery),
+ fsparam_flag_no ("discard", Opt_discard),
+ {}
+};
- if (!*p)
- continue;
+struct nilfs_fs_context {
+ unsigned long ns_mount_opt;
+ __u64 cno;
+};
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_barrier:
- nilfs_set_opt(nilfs, BARRIER);
- break;
- case Opt_nobarrier:
+static int nilfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct nilfs_fs_context *nilfs = fc->fs_private;
+ int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, nilfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_barrier:
+ if (result.negated)
nilfs_clear_opt(nilfs, BARRIER);
- break;
- case Opt_order:
- if (strcmp(args[0].from, "relaxed") == 0)
- /* Ordered data semantics */
- nilfs_clear_opt(nilfs, STRICT_ORDER);
- else if (strcmp(args[0].from, "strict") == 0)
- /* Strict in-order semantics */
- nilfs_set_opt(nilfs, STRICT_ORDER);
- else
- return 0;
- break;
- case Opt_err_panic:
- nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
- break;
- case Opt_err_cont:
- nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
- break;
- case Opt_snapshot:
- if (is_remount) {
- nilfs_err(sb,
- "\"%s\" option is invalid for remount",
- p);
- return 0;
- }
- break;
- case Opt_norecovery:
- nilfs_set_opt(nilfs, NORECOVERY);
- break;
- case Opt_discard:
- nilfs_set_opt(nilfs, DISCARD);
- break;
- case Opt_nodiscard:
- nilfs_clear_opt(nilfs, DISCARD);
- break;
- default:
- nilfs_err(sb, "unrecognized mount option \"%s\"", p);
- return 0;
+ else
+ nilfs_set_opt(nilfs, BARRIER);
+ break;
+ case Opt_order:
+ if (strcmp(param->string, "relaxed") == 0)
+ /* Ordered data semantics */
+ nilfs_clear_opt(nilfs, STRICT_ORDER);
+ else if (strcmp(param->string, "strict") == 0)
+ /* Strict in-order semantics */
+ nilfs_set_opt(nilfs, STRICT_ORDER);
+ else
+ return -EINVAL;
+ break;
+ case Opt_err:
+ nilfs->ns_mount_opt &= ~NILFS_MOUNT_ERROR_MODE;
+ nilfs->ns_mount_opt |= result.uint_32;
+ break;
+ case Opt_snapshot:
+ if (is_remount) {
+ struct super_block *sb = fc->root->d_sb;
+
+ nilfs_err(sb,
+ "\"%s\" option is invalid for remount",
+ param->key);
+ return -EINVAL;
+ }
+ if (result.uint_64 == 0) {
+ nilfs_err(NULL,
+ "invalid option \"cp=0\": invalid checkpoint number 0");
+ return -EINVAL;
}
+ nilfs->cno = result.uint_64;
+ break;
+ case Opt_norecovery:
+ nilfs_set_opt(nilfs, NORECOVERY);
+ break;
+ case Opt_discard:
+ if (result.negated)
+ nilfs_clear_opt(nilfs, DISCARD);
+ else
+ nilfs_set_opt(nilfs, DISCARD);
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
-}
-
-static inline void
-nilfs_set_default_options(struct super_block *sb,
- struct nilfs_super_block *sbp)
-{
- struct the_nilfs *nilfs = sb->s_fs_info;
- nilfs->ns_mount_opt =
- NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
+ return 0;
}
static int nilfs_setup_super(struct super_block *sb, int is_mount)
@@ -857,9 +849,8 @@ struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
}
-int nilfs_store_magic_and_option(struct super_block *sb,
- struct nilfs_super_block *sbp,
- char *data)
+int nilfs_store_magic(struct super_block *sb,
+ struct nilfs_super_block *sbp)
{
struct the_nilfs *nilfs = sb->s_fs_info;
@@ -870,14 +861,12 @@ int nilfs_store_magic_and_option(struct super_block *sb,
sb->s_flags |= SB_NOATIME;
#endif
- nilfs_set_default_options(sb, sbp);
-
nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
- return !parse_options(data, sb, 0) ? -EINVAL : 0;
+ return 0;
}
int nilfs_check_feature_compatibility(struct super_block *sb,
@@ -1035,17 +1024,17 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
/**
* nilfs_fill_super() - initialize a super block instance
* @sb: super_block
- * @data: mount options
- * @silent: silent mode flag
+ * @fc: filesystem context
*
* This function is called exclusively by nilfs->ns_mount_mutex.
* So, the recovery process is protected from other simultaneous mounts.
*/
static int
-nilfs_fill_super(struct super_block *sb, void *data, int silent)
+nilfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct the_nilfs *nilfs;
struct nilfs_root *fsroot;
+ struct nilfs_fs_context *ctx = fc->fs_private;
__u64 cno;
int err;
@@ -1055,10 +1044,13 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = nilfs;
- err = init_nilfs(nilfs, sb, (char *)data);
+ err = init_nilfs(nilfs, sb);
if (err)
goto failed_nilfs;
+ /* Copy in parsed mount options */
+ nilfs->ns_mount_opt = ctx->ns_mount_opt;
+
sb->s_op = &nilfs_sops;
sb->s_export_op = &nilfs_export_ops;
sb->s_root = NULL;
@@ -1117,34 +1109,25 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
return err;
}
-static int nilfs_remount(struct super_block *sb, int *flags, char *data)
+static int nilfs_reconfigure(struct fs_context *fc)
{
+ struct nilfs_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct the_nilfs *nilfs = sb->s_fs_info;
- unsigned long old_sb_flags;
- unsigned long old_mount_opt;
int err;
sync_filesystem(sb);
- old_sb_flags = sb->s_flags;
- old_mount_opt = nilfs->ns_mount_opt;
-
- if (!parse_options(data, sb, 1)) {
- err = -EINVAL;
- goto restore_opts;
- }
- sb->s_flags = (sb->s_flags & ~SB_POSIXACL);
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
nilfs_warn(sb,
"couldn't remount because the filesystem is in an incomplete recovery state");
- goto restore_opts;
+ goto ignore_opts;
}
-
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
- if (*flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
sb->s_flags |= SB_RDONLY;
/*
@@ -1172,138 +1155,67 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
"couldn't remount RDWR because of unsupported optional features (%llx)",
(unsigned long long)features);
err = -EROFS;
- goto restore_opts;
+ goto ignore_opts;
}
sb->s_flags &= ~SB_RDONLY;
root = NILFS_I(d_inode(sb->s_root))->i_root;
err = nilfs_attach_log_writer(sb, root);
- if (err)
- goto restore_opts;
+ if (err) {
+ sb->s_flags |= SB_RDONLY;
+ goto ignore_opts;
+ }
down_write(&nilfs->ns_sem);
nilfs_setup_super(sb, true);
up_write(&nilfs->ns_sem);
}
out:
- return 0;
-
- restore_opts:
- sb->s_flags = old_sb_flags;
- nilfs->ns_mount_opt = old_mount_opt;
- return err;
-}
-
-struct nilfs_super_data {
- __u64 cno;
- int flags;
-};
-
-static int nilfs_parse_snapshot_option(const char *option,
- const substring_t *arg,
- struct nilfs_super_data *sd)
-{
- unsigned long long val;
- const char *msg = NULL;
- int err;
-
- if (!(sd->flags & SB_RDONLY)) {
- msg = "read-only option is not specified";
- goto parse_error;
- }
-
- err = kstrtoull(arg->from, 0, &val);
- if (err) {
- if (err == -ERANGE)
- msg = "too large checkpoint number";
- else
- msg = "malformed argument";
- goto parse_error;
- } else if (val == 0) {
- msg = "invalid checkpoint number 0";
- goto parse_error;
- }
- sd->cno = val;
- return 0;
-
-parse_error:
- nilfs_err(NULL, "invalid option \"%s\": %s", option, msg);
- return 1;
-}
-
-/**
- * nilfs_identify - pre-read mount options needed to identify mount instance
- * @data: mount options
- * @sd: nilfs_super_data
- */
-static int nilfs_identify(char *data, struct nilfs_super_data *sd)
-{
- char *p, *options = data;
- substring_t args[MAX_OPT_ARGS];
- int token;
- int ret = 0;
-
- do {
- p = strsep(&options, ",");
- if (p != NULL && *p) {
- token = match_token(p, tokens, args);
- if (token == Opt_snapshot)
- ret = nilfs_parse_snapshot_option(p, &args[0],
- sd);
- }
- if (!options)
- break;
- BUG_ON(options == data);
- *(options - 1) = ',';
- } while (!ret);
- return ret;
-}
+ sb->s_flags = (sb->s_flags & ~SB_POSIXACL);
+ /* Copy over parsed remount options */
+ nilfs->ns_mount_opt = ctx->ns_mount_opt;
-static int nilfs_set_bdev_super(struct super_block *s, void *data)
-{
- s->s_dev = *(dev_t *)data;
return 0;
-}
-static int nilfs_test_bdev_super(struct super_block *s, void *data)
-{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
+ ignore_opts:
+ return err;
}
-static struct dentry *
-nilfs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int
+nilfs_get_tree(struct fs_context *fc)
{
- struct nilfs_super_data sd = { .flags = flags };
+ struct nilfs_fs_context *ctx = fc->fs_private;
struct super_block *s;
dev_t dev;
int err;
- if (nilfs_identify(data, &sd))
- return ERR_PTR(-EINVAL);
+ if (ctx->cno && !(fc->sb_flags & SB_RDONLY)) {
+ nilfs_err(NULL,
+ "invalid option \"cp=%llu\": read-only option is not specified",
+ ctx->cno);
+ return -EINVAL;
+ }
- err = lookup_bdev(dev_name, &dev);
+ err = lookup_bdev(fc->source, &dev);
if (err)
- return ERR_PTR(err);
+ return err;
- s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
- &dev);
+ s = sget_dev(fc, dev);
if (IS_ERR(s))
- return ERR_CAST(s);
+ return PTR_ERR(s);
if (!s->s_root) {
- err = setup_bdev_super(s, flags, NULL);
+ err = setup_bdev_super(s, fc->sb_flags, fc);
if (!err)
- err = nilfs_fill_super(s, data,
- flags & SB_SILENT ? 1 : 0);
+ err = nilfs_fill_super(s, fc);
if (err)
goto failed_super;
s->s_flags |= SB_ACTIVE;
- } else if (!sd.cno) {
+ } else if (!ctx->cno) {
if (nilfs_tree_is_busy(s->s_root)) {
- if ((flags ^ s->s_flags) & SB_RDONLY) {
+ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
nilfs_err(s,
"the device already has a %s mount.",
sb_rdonly(s) ? "read-only" : "read/write");
@@ -1312,37 +1224,75 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
} else {
/*
- * Try remount to setup mount states if the current
+ * Try reconfigure to setup mount states if the current
* tree is not mounted and only snapshots use this sb.
+ *
+ * Since nilfs_reconfigure() requires fc->root to be
+ * set, set it first and release it on failure.
*/
- err = nilfs_remount(s, &flags, data);
- if (err)
+ fc->root = dget(s->s_root);
+ err = nilfs_reconfigure(fc);
+ if (err) {
+ dput(fc->root);
+ fc->root = NULL; /* prevent double release */
goto failed_super;
+ }
+ return 0;
}
}
- if (sd.cno) {
+ if (ctx->cno) {
struct dentry *root_dentry;
- err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
+ err = nilfs_attach_snapshot(s, ctx->cno, &root_dentry);
if (err)
goto failed_super;
- return root_dentry;
+ fc->root = root_dentry;
+ return 0;
}
- return dget(s->s_root);
+ fc->root = dget(s->s_root);
+ return 0;
failed_super:
deactivate_locked_super(s);
- return ERR_PTR(err);
+ return err;
+}
+
+static void nilfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations nilfs_context_ops = {
+ .parse_param = nilfs_parse_param,
+ .get_tree = nilfs_get_tree,
+ .reconfigure = nilfs_reconfigure,
+ .free = nilfs_free_fc,
+};
+
+static int nilfs_init_fs_context(struct fs_context *fc)
+{
+ struct nilfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns_mount_opt = NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
+ fc->fs_private = ctx;
+ fc->ops = &nilfs_context_ops;
+
+ return 0;
}
struct file_system_type nilfs_fs_type = {
.owner = THIS_MODULE,
.name = "nilfs2",
- .mount = nilfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = nilfs_init_fs_context,
+ .parameters = nilfs_param_spec,
};
MODULE_ALIAS_FS("nilfs2");
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 2ae2c1bbf6d17c..db322068678f40 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -659,7 +659,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
* init_nilfs - initialize a NILFS instance.
* @nilfs: the_nilfs structure
* @sb: super block
- * @data: mount options
*
* init_nilfs() performs common initialization per block device (e.g.
* reading the super block, getting disk layout information, initializing
@@ -668,7 +667,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
* Return Value: On success, 0 is returned. On error, a negative error
* code is returned.
*/
-int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
{
struct nilfs_super_block *sbp;
int blocksize;
@@ -686,7 +685,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto out;
- err = nilfs_store_magic_and_option(sb, sbp, data);
+ err = nilfs_store_magic(sb, sbp);
if (err)
goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index cd4ae1b8ae165a..85da0629415dfe 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -219,10 +219,6 @@ THE_NILFS_FNS(PURGING, purging)
#define nilfs_set_opt(nilfs, opt) \
((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt)
#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
-#define nilfs_write_opt(nilfs, mask, opt) \
- ((nilfs)->ns_mount_opt = \
- (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
- NILFS_MOUNT_##opt)) \
/**
* struct nilfs_root - nilfs root object
@@ -276,7 +272,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
struct the_nilfs *alloc_nilfs(struct super_block *sb);
void destroy_nilfs(struct the_nilfs *nilfs);
-int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs);
void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2e0a2f3382829c..2018501b224937 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1527,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
{
int status, node, live;
- status = 0;
node = -1;
while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
node + 1)) < O2NM_MAX_NODES) {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8ff79bd427ec68..81fbecfe5ff6c7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -730,19 +730,20 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- struct page *page = NULL;
- pte_t ptent = ptep_get(pte);
+ pte_t ptent = huge_ptep_get(pte);
+ struct folio *folio = NULL;
if (pte_present(ptent)) {
- page = vm_normal_page(vma, addr, ptent);
+ folio = page_folio(pte_page(ptent));
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
if (is_pfn_swap_entry(swpent))
- page = pfn_swap_entry_to_page(swpent);
+ folio = pfn_swap_entry_folio(swpent);
}
- if (page) {
- if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
+ if (folio) {
+ if (folio_likely_mapped_shared(folio) ||
+ hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
@@ -870,8 +871,8 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %8u\n",
- !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
- true, THP_ORDERS_ALL));
+ !!thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1578,12 +1579,13 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
pte = huge_ptep_get(ptep);
if (pte_present(pte)) {
- struct page *page = pte_page(pte);
+ struct folio *folio = page_folio(pte_page(pte));
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
flags |= PM_FILE;
- if (page_mapcount(page) == 1)
+ if (!folio_likely_mapped_shared(folio) &&
+ !hugetlb_pmd_shared(ptep))
flags |= PM_MMAP_EXCLUSIVE;
if (huge_pte_uffd_wp(pte))
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe69..ffeb71e7215457 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -254,16 +254,12 @@ static inline unsigned long __ffs64(u64 word)
*/
static inline unsigned long fns(unsigned long word, unsigned int n)
{
- unsigned int bit;
+ unsigned int i;
- while (word) {
- bit = __ffs(word);
- if (n-- == 0)
- return bit;
- __clear_bit(bit, &word);
- }
+ for (i = 0; word && i < n; i++)
+ word &= word - 1;
- return BITS_PER_LONG;
+ return word ? __ffs(word) : BITS_PER_LONG;
}
/**
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index d78454a4dd1f0f..a1c0bdd0cca66b 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -303,12 +303,38 @@ static inline void put_bh(struct buffer_head *bh)
atomic_dec(&bh->b_count);
}
+/**
+ * brelse - Release a buffer.
+ * @bh: The buffer to release.
+ *
+ * Decrement a buffer_head's reference count. If @bh is NULL, this
+ * function is a no-op.
+ *
+ * If all buffers on a folio have zero reference count, are clean
+ * and unlocked, and if the folio is unlocked and not under writeback
+ * then try_to_free_buffers() may strip the buffers from the folio in
+ * preparation for freeing it (sometimes, rarely, buffers are removed
+ * from a folio but it ends up not being freed, and buffers may later
+ * be reattached).
+ *
+ * Context: Any context.
+ */
static inline void brelse(struct buffer_head *bh)
{
if (bh)
__brelse(bh);
}
+/**
+ * bforget - Discard any dirty data in a buffer.
+ * @bh: The buffer to forget.
+ *
+ * Call this function instead of brelse() if the data written to a buffer
+ * no longer needs to be written back. It will clear the buffer's dirty
+ * flag so writeback of this buffer will be skipped.
+ *
+ * Context: Any context.
+ */
static inline void bforget(struct buffer_head *bh)
{
if (bh)
@@ -437,17 +463,21 @@ static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
}
/**
- * __bread() - reads a specified block and returns the bh
- * @bdev: the block_device to read from
- * @block: number of block
- * @size: size (in bytes) to read
+ * __bread() - Read a block.
+ * @bdev: The block device to read from.
+ * @block: Block number in units of block size.
+ * @size: The block size of this device in bytes.
*
- * Reads a specified block, and returns buffer head that contains it.
- * The page cache is allocated from movable area so that it can be migrated.
- * It returns NULL if the block was unreadable.
+ * Read a specified block, and return the buffer head that refers
+ * to it. The memory is allocated from the movable area so that it can
+ * be migrated. The returned buffer head has its refcount increased.
+ * The caller should call brelse() when it has finished with the buffer.
+ *
+ * Context: May sleep waiting for I/O.
+ * Return: NULL if the block was unreadable.
*/
-static inline struct buffer_head *
-__bread(struct block_device *bdev, sector_t block, unsigned size)
+static inline struct buffer_head *__bread(struct block_device *bdev,
+ sector_t block, unsigned size)
{
return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
}
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 886d07294f4e7c..f7da65e1ac041e 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -297,6 +297,7 @@ struct damos_stat {
* enum damos_filter_type - Type of memory for &struct damos_filter
* @DAMOS_FILTER_TYPE_ANON: Anonymous pages.
* @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages.
+ * @DAMOS_FILTER_TYPE_YOUNG: Recently accessed pages.
* @DAMOS_FILTER_TYPE_ADDR: Address range.
* @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target.
* @NR_DAMOS_FILTER_TYPES: Number of filter types.
@@ -315,6 +316,7 @@ struct damos_stat {
enum damos_filter_type {
DAMOS_FILTER_TYPE_ANON,
DAMOS_FILTER_TYPE_MEMCG,
+ DAMOS_FILTER_TYPE_YOUNG,
DAMOS_FILTER_TYPE_ADDR,
DAMOS_FILTER_TYPE_TARGET,
NR_DAMOS_FILTER_TYPES,
diff --git a/include/linux/fpu.h b/include/linux/fpu.h
new file mode 100644
index 00000000000000..2fb63e22913bcf
--- /dev/null
+++ b/include/linux/fpu.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_FPU_H
+#define _LINUX_FPU_H
+
+#ifdef _LINUX_FPU_COMPILATION_UNIT
+#error FP code must be compiled separately. See Documentation/core-api/floating-point.rst.
+#endif
+
+#include <asm/fpu.h>
+
+#endif
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f673..c8d3ec116e291b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -81,8 +81,12 @@ extern struct kobj_attribute shmem_enabled_attr;
*/
#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
-#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
- (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+#define TVA_SMAPS (1 << 0) /* Will be used for procfs */
+#define TVA_IN_PF (1 << 1) /* Page fault handler */
+#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */
+
+#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
+ (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
@@ -218,17 +222,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags, bool smaps,
- bool in_pf, bool enforce_sysfs,
+ unsigned long vm_flags,
+ unsigned long tva_flags,
unsigned long orders);
/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
* @vm_flags: use these vm_flags instead of vma->vm_flags
- * @smaps: whether answer will be used for smaps file
- * @in_pf: whether answer will be used by page fault handler
- * @enforce_sysfs: whether sysfs config should be taken into account
+ * @tva_flags: Which TVA flags to honour
* @orders: bitfield of all orders to consider
*
* Calculates the intersection of the requested hugepage orders and the allowed
@@ -241,12 +243,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
*/
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags, bool smaps,
- bool in_pf, bool enforce_sysfs,
+ unsigned long vm_flags,
+ unsigned long tva_flags,
unsigned long orders)
{
/* Optimization to check if required orders are enabled early. */
- if (enforce_sysfs && vma_is_anonymous(vma)) {
+ if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
if (vm_flags & VM_HUGEPAGE)
@@ -260,8 +262,30 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}
- return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
- enforce_sysfs, orders);
+ return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
+}
+
+enum mthp_stat_item {
+ MTHP_STAT_ANON_FAULT_ALLOC,
+ MTHP_STAT_ANON_FAULT_FALLBACK,
+ MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_ANON_SWPOUT,
+ MTHP_STAT_ANON_SWPOUT_FALLBACK,
+ __MTHP_STAT_COUNT
+};
+
+struct mthp_stat {
+ unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
+};
+
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+ if (order <= 0 || order > PMD_ORDER)
+ return;
+
+ this_cpu_inc(mthp_stats.stats[order][item]);
}
#define transparent_hugepage_use_zero_page() \
@@ -405,8 +429,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
}
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags, bool smaps,
- bool in_pf, bool enforce_sysfs,
+ unsigned long vm_flags,
+ unsigned long tva_flags,
unsigned long orders)
{
return 0;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f3e628802792a..68244bb3637a87 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -178,7 +178,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address);
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages[MAX_NUMNODES];
@@ -297,8 +297,8 @@ static inline unsigned long hugetlb_total_pages(void)
return 0;
}
-static inline struct address_space *hugetlb_page_mapping_lock_write(
- struct page *hpage)
+static inline struct address_space *hugetlb_folio_mapping_lock_write(
+ struct folio *folio)
{
return NULL;
}
@@ -861,8 +861,8 @@ static inline int hstate_index(struct hstate *h)
return h - hstates;
}
-extern int dissolve_free_huge_page(struct page *page);
-extern int dissolve_free_huge_pages(unsigned long start_pfn,
+int dissolve_free_hugetlb_folio(struct folio *folio);
+int dissolve_free_hugetlb_folios(unsigned long start_pfn,
unsigned long end_pfn);
#ifdef CONFIG_MEMORY_FAILURE
@@ -1148,12 +1148,12 @@ static inline int hstate_index(struct hstate *h)
return 0;
}
-static inline int dissolve_free_huge_page(struct page *page)
+static inline int dissolve_free_hugetlb_folio(struct folio *folio)
{
return 0;
}
-static inline int dissolve_free_huge_pages(unsigned long start_pfn,
+static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn,
unsigned long end_pfn)
{
return 0;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 060835bb82d52f..f31bd304df4518 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -461,10 +461,8 @@ static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) {
extern bool kexec_file_dbg_print;
-#define kexec_dprintk(fmt, ...) \
- printk("%s" fmt, \
- kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG, \
- ##__VA_ARGS__)
+#define kexec_dprintk(fmt, arg...) \
+ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0)
#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 0b35a41440ff13..6b28d642f3325e 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -36,10 +36,15 @@
* to lock the reader.
*/
-#include <linux/kernel.h>
+#include <linux/array_size.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
-#include <linux/scatterlist.h>
+#include <linux/types.h>
+
+#include <asm/barrier.h>
+#include <asm/errno.h>
+
+struct scatterlist;
struct __kfifo {
unsigned int in;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 358803cfd4d5eb..52c63a9c5a9cdf 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -81,15 +81,9 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
-
-#ifdef CONFIG_MEMORY_FAILURE
-void collect_procs_ksm(struct page *page, struct list_head *to_kill,
- int force_early);
-#endif
-
-#ifdef CONFIG_PROC_FS
+void collect_procs_ksm(struct folio *folio, struct page *page,
+ struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *);
-#endif /* CONFIG_PROC_FS */
#else /* !CONFIG_KSM */
@@ -120,12 +114,10 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
}
-#ifdef CONFIG_MEMORY_FAILURE
-static inline void collect_procs_ksm(struct page *page,
+static inline void collect_procs_ksm(struct folio *folio, struct page *page,
struct list_head *to_kill, int force_early)
{
}
-#endif
#ifdef CONFIG_MMU
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f332b4ae84ca8..9aba0d0462ca69 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1077,8 +1077,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
-void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
- int val);
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
@@ -1091,16 +1089,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
local_irq_restore(flags);
}
-static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx, int val)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_memcg_lruvec_state(lruvec, idx, val);
- local_irq_restore(flags);
-}
-
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count);
@@ -1594,11 +1582,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}
-static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx, int val)
-{
-}
-
static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
int val)
{
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 69e78190008271..0d70788558f463 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
const char *source);
int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+ struct list_head *memory_types);
+void mt_put_memory_types(struct list_head *memory_types);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct access_coordinate *perf, int *adis
{
return -EIO;
}
+
+static inline struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+ struct list_head *memory_types)
+{
+ return NULL;
+}
+
+static inline void mt_put_memory_types(struct list_head *memory_types)
+{
+}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc9410d772ddf0..9849dfda44d43c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1232,7 +1232,7 @@ static inline int page_mapcount(struct page *page)
int mapcount = atomic_read(&page->_mapcount) + 1;
/* Handle page_has_type() pages */
- if (mapcount < 0)
+ if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
mapcount = 0;
if (unlikely(PageCompound(page)))
mapcount += folio_entire_mapcount(page_folio(page));
@@ -1240,34 +1240,44 @@ static inline int page_mapcount(struct page *page)
return mapcount;
}
-int folio_total_mapcount(const struct folio *folio);
+static inline int folio_large_mapcount(const struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
+ return atomic_read(&folio->_large_mapcount) + 1;
+}
/**
- * folio_mapcount() - Calculate the number of mappings of this folio.
+ * folio_mapcount() - Number of mappings of this folio.
* @folio: The folio.
*
- * A large folio tracks both how many times the entire folio is mapped,
- * and how many times each individual page in the folio is mapped.
- * This function calculates the total number of times the folio is
- * mapped.
+ * The folio mapcount corresponds to the number of present user page table
+ * entries that reference any part of a folio. Each such present user page
+ * table entry must be paired with exactly on folio reference.
+ *
+ * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts
+ * exactly once.
+ *
+ * For hugetlb folios, each abstracted "hugetlb" user page table entry that
+ * references the entire folio counts exactly once, even when such special
+ * page table entries are comprised of multiple ordinary page table entries.
+ *
+ * Will report 0 for pages which cannot be mapped into userspace, such as
+ * slab, page tables and similar.
*
* Return: The number of times this folio is mapped.
*/
static inline int folio_mapcount(const struct folio *folio)
{
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) + 1;
- return folio_total_mapcount(folio);
-}
+ int mapcount;
-static inline bool folio_large_is_mapped(const struct folio *folio)
-{
- /*
- * Reading _entire_mapcount below could be omitted if hugetlb
- * participated in incrementing nr_pages_mapped when compound mapped.
- */
- return atomic_read(&folio->_nr_pages_mapped) > 0 ||
- atomic_read(&folio->_entire_mapcount) >= 0;
+ if (likely(!folio_test_large(folio))) {
+ mapcount = atomic_read(&folio->_mapcount) + 1;
+ /* Handle page_has_type() pages */
+ if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+ mapcount = 0;
+ return mapcount;
+ }
+ return folio_large_mapcount(folio);
}
/**
@@ -1276,11 +1286,9 @@ static inline bool folio_large_is_mapped(const struct folio *folio)
*
* Return: True if any page in this folio is referenced by user page tables.
*/
-static inline bool folio_mapped(struct folio *folio)
+static inline bool folio_mapped(const struct folio *folio)
{
- if (likely(!folio_test_large(folio)))
- return atomic_read(&folio->_mapcount) >= 0;
- return folio_large_is_mapped(folio);
+ return folio_mapcount(folio) >= 1;
}
/*
@@ -1290,9 +1298,7 @@ static inline bool folio_mapped(struct folio *folio)
*/
static inline bool page_mapped(const struct page *page)
{
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- return folio_large_is_mapped(page_folio(page));
+ return folio_mapped(page_folio(page));
}
static inline struct page *virt_to_head_page(const void *x)
@@ -1435,27 +1441,22 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
if (!static_branch_unlikely(&devmap_managed_key))
return false;
- if (!is_zone_device_page(page))
+ if (!folio_is_zone_device(folio))
return false;
- return __put_devmap_managed_page_refs(page, refs);
+ return __put_devmap_managed_folio_refs(folio, refs);
}
#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
return false;
}
#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page(struct page *page)
-{
- return put_devmap_managed_page_refs(page, 1);
-}
-
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1574,7 +1575,7 @@ static inline void put_page(struct page *page)
* For some devmap managed pages we need to catch refcount transition
* from 2 to 1:
*/
- if (put_devmap_managed_page(&folio->page))
+ if (put_devmap_managed_folio_refs(folio, 1))
return;
folio_put(folio);
}
@@ -2188,7 +2189,7 @@ static inline size_t folio_size(struct folio *folio)
* indicate "mapped shared" (false positive) when two VMAs in the same MM
* cover the same file range.
* #. For (small) KSM folios, the return value can wrongly indicate "mapped
- * shared" (false negative), when the folio is mapped multiple times into
+ * shared" (false positive), when the folio is mapped multiple times into
* the same MM.
*
* Further, this function only considers current page table mappings that
@@ -2205,7 +2206,22 @@ static inline size_t folio_size(struct folio *folio)
*/
static inline bool folio_likely_mapped_shared(struct folio *folio)
{
- return page_mapcount(folio_page(folio, 0)) > 1;
+ int mapcount = folio_mapcount(folio);
+
+ /* Only partially-mappable folios require more care. */
+ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
+ return mapcount > 1;
+
+ /* A single mapping implies "mapped exclusively". */
+ if (mapcount <= 1)
+ return false;
+
+ /* If any page is mapped more than once we treat it "mapped shared". */
+ if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio))
+ return true;
+
+ /* Let's guess based on the first subpage. */
+ return atomic_read(&folio->_mapcount) > 0;
}
#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
@@ -2420,7 +2436,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
@@ -2575,7 +2591,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
MM_CP_UFFD_WP_RESOLVE)
bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
/*
@@ -4012,7 +4028,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
-extern void shake_page(struct page *p);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fa0d6995706f4f..24323c7d0bd484 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -289,7 +289,8 @@ typedef struct {
* @virtual: Virtual address in the kernel direct map.
* @_last_cpupid: IDs of last CPU and last process that accessed the folio.
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
- * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
+ * @_large_mapcount: Do not use directly, call folio_mapcount().
+ * @_nr_pages_mapped: Do not use outside of rmap and debug code.
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
* @_folio_nr_pages: Do not use directly, call folio_nr_pages().
* @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
@@ -348,8 +349,8 @@ struct folio {
struct {
unsigned long _flags_1;
unsigned long _head_1;
- unsigned long _folio_avail;
/* public: */
+ atomic_t _large_mapcount;
atomic_t _entire_mapcount;
atomic_t _nr_pages_mapped;
atomic_t _pincount;
@@ -1367,6 +1368,15 @@ enum fault_flag {
typedef unsigned int __bitwise zap_flags_t;
+/* Flags for clear_young_dirty_ptes(). */
+typedef int __bitwise cydp_t;
+
+/* Clear the access bit */
+#define CYDP_CLEAR_YOUNG ((__force cydp_t)BIT(0))
+
+/* Clear the dirty bit */
+#define CYDP_CLEAR_DIRTY ((__force cydp_t)BIT(1))
+
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
* other. Here is what they mean, and how to use them:
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f83331684e470d..104078afe0b16f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -512,9 +512,9 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
-PAGEFLAG(Referenced, referenced, PF_HEAD)
- TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
- __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
+ FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
+ __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
@@ -789,7 +789,7 @@ static inline bool folio_test_uptodate(const struct folio *folio)
return ret;
}
-static inline int PageUptodate(const struct page *page)
+static inline bool PageUptodate(const struct page *page)
{
return folio_test_uptodate(page_folio(page));
}
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index d7c2d33baa7f8e..1acf5bac7f503b 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -139,20 +139,15 @@ static inline void folio_ref_sub(struct folio *folio, int nr)
page_ref_sub(&folio->page, nr);
}
-static inline int page_ref_sub_return(struct page *page, int nr)
+static inline int folio_ref_sub_return(struct folio *folio, int nr)
{
- int ret = atomic_sub_return(nr, &page->_refcount);
+ int ret = atomic_sub_return(nr, &folio->_refcount);
if (page_ref_tracepoint_active(page_ref_mod_and_return))
- __page_ref_mod_and_return(page, -nr, ret);
+ __page_ref_mod_and_return(&folio->page, -nr, ret);
return ret;
}
-static inline int folio_ref_sub_return(struct folio *folio, int nr)
-{
- return page_ref_sub_return(&folio->page, nr);
-}
-
static inline void page_ref_inc(struct page *page)
{
atomic_inc(&page->_refcount);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 65e332df4d7696..4e85f33721c42f 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -399,7 +399,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
#endif
}
-struct address_space *page_mapping(struct page *);
struct address_space *folio_mapping(struct folio *);
struct address_space *swapcache_mapping(struct folio *);
@@ -558,11 +557,6 @@ static inline struct page *__page_cache_alloc(gfp_t gfp)
return &filemap_alloc_folio(gfp, 0)->page;
}
-static inline struct page *page_cache_alloc(struct address_space *x)
-{
- return __page_cache_alloc(mapping_gfp_mask(x));
-}
-
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e2f45e22a6d138..18019f037baeca 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -361,36 +361,6 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
}
#endif
-#ifndef mkold_ptes
-/**
- * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old.
- * @vma: VMA the pages are mapped into.
- * @addr: Address the first page is mapped at.
- * @ptep: Page table pointer for the first entry.
- * @nr: Number of entries to mark old.
- *
- * May be overridden by the architecture; otherwise, implemented as a simple
- * loop over ptep_test_and_clear_young().
- *
- * Note that PTE bits in the PTE range besides the PFN can differ. For example,
- * some PTEs might be write-protected.
- *
- * Context: The caller holds the page table lock. The PTEs map consecutive
- * pages that belong to the same folio. The PTEs are all in the same PMD.
- */
-static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, unsigned int nr)
-{
- for (;;) {
- ptep_test_and_clear_young(vma, addr, ptep);
- if (--nr == 0)
- break;
- ptep++;
- addr += PAGE_SIZE;
- }
-}
-#endif
-
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
@@ -489,6 +459,50 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
}
#endif
+#ifndef clear_young_dirty_ptes
+/**
+ * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
+ * same folio as old/clean.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to mark old/clean.
+ * @flags: Flags to modify the PTE batch semantics.
+ *
+ * May be overridden by the architecture; otherwise, implemented by
+ * get_and_clear/modify/set for each pte in the range.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ pte_t pte;
+
+ for (;;) {
+ if (flags == CYDP_CLEAR_YOUNG)
+ ptep_test_and_clear_young(vma, addr, ptep);
+ else {
+ pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);
+ if (flags & CYDP_CLEAR_YOUNG)
+ pte = pte_mkold(pte);
+ if (flags & CYDP_CLEAR_DIRTY)
+ pte = pte_mkclean(pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+ }
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
+
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9bf9324214fc3f..7229b9baf20d87 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -273,6 +273,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
ClearPageAnonExclusive(&folio->page);
}
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
return 0;
}
@@ -306,6 +307,7 @@ static inline void hugetlb_add_file_rmap(struct folio *folio)
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
}
static inline void hugetlb_remove_rmap(struct folio *folio)
@@ -313,21 +315,31 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
atomic_dec(&folio->_entire_mapcount);
+ atomic_dec(&folio->_large_mapcount);
}
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
struct page *page, int nr_pages, enum rmap_level level)
{
+ const int orig_nr_pages = nr_pages;
+
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
+ if (!folio_test_large(folio)) {
+ atomic_inc(&page->_mapcount);
+ break;
+ }
+
do {
atomic_inc(&page->_mapcount);
} while (page++, --nr_pages > 0);
+ atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
break;
}
}
@@ -347,8 +359,12 @@ static inline void folio_dup_file_rmap_ptes(struct folio *folio,
{
__folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
}
-#define folio_dup_file_rmap_pte(folio, page) \
- folio_dup_file_rmap_ptes(folio, page, 1)
+
+static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
+ struct page *page)
+{
+ __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+}
/**
* folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
@@ -373,6 +389,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *src_vma,
enum rmap_level level)
{
+ const int orig_nr_pages = nr_pages;
bool maybe_pinned;
int i;
@@ -401,11 +418,20 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
if (PageAnonExclusive(page + i))
return -EBUSY;
}
+
+ if (!folio_test_large(folio)) {
+ if (PageAnonExclusive(page))
+ ClearPageAnonExclusive(page);
+ atomic_inc(&page->_mapcount);
+ break;
+ }
+
do {
if (PageAnonExclusive(page))
ClearPageAnonExclusive(page);
atomic_inc(&page->_mapcount);
} while (page++, --nr_pages > 0);
+ atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
if (PageAnonExclusive(page)) {
@@ -414,6 +440,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
ClearPageAnonExclusive(page);
}
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
break;
}
return 0;
@@ -448,8 +475,13 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
RMAP_LEVEL_PTE);
}
-#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
- folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)
+
+static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
+ struct page *page, struct vm_area_struct *src_vma)
+{
+ return __folio_try_dup_anon_rmap(folio, page, 1, src_vma,
+ RMAP_LEVEL_PTE);
+}
/**
* folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
@@ -698,7 +730,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/*
* rmap_walk_control: To control rmap traversing for specific needs
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6b672a67ae5d9d..11c53692f65fb7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -406,10 +406,13 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+#define MIN_SWAPPINESS 0
+#define MAX_SWAPPINESS 200
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options);
+ unsigned int reclaim_options,
+ int *swappiness);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e619ac10cd234c..9104952d323d1d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -821,6 +821,7 @@ asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
unsigned long prot, unsigned long pgoff,
unsigned long flags);
+asmlinkage long sys_mseal(unsigned long start, size_t len, unsigned long flags);
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
unsigned long mode,
const unsigned long __user *nmask,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9845cb62e40b2d..112d806ddbe49d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -355,6 +355,7 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
+unsigned long cgwb_calc_thresh(struct bdi_writeback *wb);
void wb_update_bandwidth(struct bdi_writeback *wb);
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index d9d479334c9e65..d2e4d36246890c 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -12,14 +12,18 @@
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/compiler.h>
+#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+struct list_lru;
+
/*
* The bottom two bits of the entry determine how the XArray interprets
* the contents:
@@ -1146,7 +1150,7 @@ static inline void xa_release(struct xarray *xa, unsigned long index)
#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1)
#define XA_MAX_MARKS 3
-#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
+#define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE)
/*
* @count is the count of every non-NULL element in the ->slots array
diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h
index 8a99c1cd417b81..fe33a255b7d093 100644
--- a/include/trace/events/page_ref.h
+++ b/include/trace/events/page_ref.h
@@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template,
__entry->pfn = page_to_pfn(page);
__entry->flags = page->flags;
__entry->count = page_ref_count(page);
- __entry->mapcount = page_mapcount(page);
+ __entry->mapcount = atomic_read(&page->_mapcount);
__entry->mapping = page->mapping;
__entry->mt = get_pageblock_migratetype(page);
__entry->val = v;
@@ -79,7 +79,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template,
__entry->pfn = page_to_pfn(page);
__entry->flags = page->flags;
__entry->count = page_ref_count(page);
- __entry->mapcount = page_mapcount(page);
+ __entry->mapcount = atomic_read(&page->_mapcount);
__entry->mapping = page->mapping;
__entry->mt = get_pageblock_migratetype(page);
__entry->val = v;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 75f00965ab1586..d983c48a3b6af1 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
#define __NR_lsm_list_modules 461
__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
+#define __NR_mseal 462
+__SYSCALL(__NR_mseal, sys_mseal)
+
#undef __NR_syscalls
-#define __NR_syscalls 462
+#define __NR_syscalls 463
/*
* 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index faad00cce269c3..d7eee421d4bc33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -196,6 +196,7 @@ COND_SYSCALL(migrate_pages);
COND_SYSCALL(move_pages);
COND_SYSCALL(set_mempolicy_home_node);
COND_SYSCALL(cachestat);
+COND_SYSCALL(mseal);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ca33eaf04398b8..c4b6b77f36bf81 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2922,7 +2922,7 @@ config TEST_FREE_PAGES
config TEST_FPU
tristate "Test floating point operations in kernel space"
- depends on X86 && !KCOV_INSTRUMENT_ALL
+ depends on ARCH_HAS_KERNEL_FPU_SUPPORT && !KCOV_INSTRUMENT_ALL
help
Enable this option to add /sys/kernel/debug/selftest_helpers/test_fpu
which will trigger a sequence of floating point operations. This is used
diff --git a/lib/Makefile b/lib/Makefile
index 2f4e17bfb2990d..1f6c51420bf6b3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -110,30 +110,10 @@ CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE)
obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o
obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o
-#
-# CFLAGS for compiling floating point code inside the kernel. x86/Makefile turns
-# off the generation of FPU/SSE* instructions for kernel proper but FPU_FLAGS
-# get appended last to CFLAGS and thus override those previous compiler options.
-#
-FPU_CFLAGS := -msse -msse2
-ifdef CONFIG_CC_IS_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
-#
-# The "-msse" in the first argument is there so that the
-# -mpreferred-stack-boundary=3 build error:
-#
-# -mpreferred-stack-boundary=3 is not between 4 and 12
-#
-# can be triggered. Otherwise gcc doesn't complain.
-FPU_CFLAGS += -mhard-float
-FPU_CFLAGS += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
-endif
-
obj-$(CONFIG_TEST_FPU) += test_fpu.o
-CFLAGS_test_fpu.o += $(FPU_CFLAGS)
+test_fpu-y := test_fpu_glue.o test_fpu_impl.o
+CFLAGS_test_fpu_impl.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_test_fpu_impl.o += $(CC_FLAGS_NO_FPU)
# Some KUnit files (hooks.o) need to be built-in even when KUnit is a module,
# so we can't just use obj-$(CONFIG_KUNIT).
diff --git a/lib/kfifo.c b/lib/kfifo.c
index 12f5a347aa1370..15acdee4a8f325 100644
--- a/lib/kfifo.c
+++ b/lib/kfifo.c
@@ -5,13 +5,13 @@
* Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
*/
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kfifo.h>
#include <linux/log2.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
#include <linux/uaccess.h>
-#include <linux/kfifo.h>
/*
* internal helper to calculate the unused elements in a fifo
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 385a94aa0b999b..0e88bfe6445bfb 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -33,25 +33,6 @@ CFLAGS_REMOVE_vpermxor8.o += -msoft-float
endif
endif
-# The GCC option -ffreestanding is required in order to compile code containing
-# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
-ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-NEON_FLAGS := -ffreestanding
-# Enable <arm_neon.h>
-NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
-ifeq ($(ARCH),arm)
-NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
-endif
-CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
-ifeq ($(ARCH),arm64)
-CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
-endif
-endif
-
quiet_cmd_unroll = UNROLL $@
cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@
@@ -75,10 +56,16 @@ targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c
$(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
-CFLAGS_neon1.o += $(NEON_FLAGS)
-CFLAGS_neon2.o += $(NEON_FLAGS)
-CFLAGS_neon4.o += $(NEON_FLAGS)
-CFLAGS_neon8.o += $(NEON_FLAGS)
+CFLAGS_neon1.o += $(CC_FLAGS_FPU)
+CFLAGS_neon2.o += $(CC_FLAGS_FPU)
+CFLAGS_neon4.o += $(CC_FLAGS_FPU)
+CFLAGS_neon8.o += $(CC_FLAGS_FPU)
+CFLAGS_recov_neon_inner.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_neon1.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_recov_neon_inner.o += $(CC_FLAGS_NO_FPU)
targets += neon1.c neon2.c neon4.c neon8.c
$(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
diff --git a/lib/test_fpu.h b/lib/test_fpu.h
new file mode 100644
index 00000000000000..4459807084bcee
--- /dev/null
+++ b/lib/test_fpu.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef _LIB_TEST_FPU_H
+#define _LIB_TEST_FPU_H
+
+int test_fpu(void);
+
+#endif
diff --git a/lib/test_fpu.c b/lib/test_fpu_glue.c
index e82db19fed84ac..eef282a2715f21 100644
--- a/lib/test_fpu.c
+++ b/lib/test_fpu_glue.c
@@ -17,39 +17,9 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>
-#include <asm/fpu/api.h>
+#include <linux/fpu.h>
-static int test_fpu(void)
-{
- /*
- * This sequence of operations tests that rounding mode is
- * to nearest and that denormal numbers are supported.
- * Volatile variables are used to avoid compiler optimizing
- * the calculations away.
- */
- volatile double a, b, c, d, e, f, g;
-
- a = 4.0;
- b = 1e-15;
- c = 1e-310;
-
- /* Sets precision flag */
- d = a + b;
-
- /* Result depends on rounding mode */
- e = a + b / 2;
-
- /* Denormal and very large values */
- f = b / c;
-
- /* Depends on denormal support */
- g = a + c * f;
-
- if (d > a && e > a && g > a)
- return 0;
- else
- return -EINVAL;
-}
+#include "test_fpu.h"
static int test_fpu_get(void *data, u64 *val)
{
@@ -68,6 +38,9 @@ static struct dentry *selftest_dir;
static int __init test_fpu_init(void)
{
+ if (!kernel_fpu_available())
+ return -EINVAL;
+
selftest_dir = debugfs_create_dir("selftest_helpers", NULL);
if (!selftest_dir)
return -ENOMEM;
diff --git a/lib/test_fpu_impl.c b/lib/test_fpu_impl.c
new file mode 100644
index 00000000000000..777894dbbe86fa
--- /dev/null
+++ b/lib/test_fpu_impl.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/errno.h>
+
+#include "test_fpu.h"
+
+int test_fpu(void)
+{
+ /*
+ * This sequence of operations tests that rounding mode is
+ * to nearest and that denormal numbers are supported.
+ * Volatile variables are used to avoid compiler optimizing
+ * the calculations away.
+ */
+ volatile double a, b, c, d, e, f, g;
+
+ a = 4.0;
+ b = 1e-15;
+ c = 1e-310;
+
+ /* Sets precision flag */
+ d = a + b;
+
+ /* Result depends on rounding mode */
+ e = a + b / 2;
+
+ /* Denormal and very large values */
+ f = b / c;
+
+ /* Depends on denormal support */
+ g = a + c * f;
+
+ if (d > a && e > a && g > a)
+ return 0;
+ else
+ return -EINVAL;
+}
diff --git a/lib/xarray.c b/lib/xarray.c
index da79128ad754fc..1c87d871cacfa0 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -200,7 +200,8 @@ static void *xas_start(struct xa_state *xas)
return entry;
}
-static void *xas_descend(struct xa_state *xas, struct xa_node *node)
+static __always_inline void *xas_descend(struct xa_state *xas,
+ struct xa_node *node)
{
unsigned int offset = get_offset(xas->xa_index, node);
void *entry = xa_entry(xas->xa, node, offset);
diff --git a/mm/Makefile b/mm/Makefile
index e164eed35d467d..25da205becdd24 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU) += process_vm_access.o
endif
+ifdef CONFIG_64BIT
+mmu-$(CONFIG_MMU) += mseal.o
+endif
+
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page-writeback.o folio-compat.o \
readahead.o swap.o truncate.o vmscan.o shrinker.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5fa3666356f957..e61bbb1bd62218 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,6 +39,19 @@ struct workqueue_struct *bdi_wq;
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+struct wb_stats {
+ unsigned long nr_dirty;
+ unsigned long nr_io;
+ unsigned long nr_more_io;
+ unsigned long nr_dirty_time;
+ unsigned long nr_writeback;
+ unsigned long nr_reclaimable;
+ unsigned long nr_dirtied;
+ unsigned long nr_written;
+ unsigned long dirty_thresh;
+ unsigned long wb_thresh;
+};
+
static struct dentry *bdi_debug_root;
static void bdi_debug_init(void)
@@ -46,31 +59,68 @@ static void bdi_debug_init(void)
bdi_debug_root = debugfs_create_dir("bdi", NULL);
}
-static int bdi_debug_stats_show(struct seq_file *m, void *v)
+static void collect_wb_stats(struct wb_stats *stats,
+ struct bdi_writeback *wb)
{
- struct backing_dev_info *bdi = m->private;
- struct bdi_writeback *wb = &bdi->wb;
- unsigned long background_thresh;
- unsigned long dirty_thresh;
- unsigned long wb_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
- nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_io_list)
- nr_dirty++;
+ stats->nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_io_list)
- nr_io++;
+ stats->nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_io_list)
- nr_more_io++;
+ stats->nr_more_io++;
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
if (inode->i_state & I_DIRTY_TIME)
- nr_dirty_time++;
+ stats->nr_dirty_time++;
spin_unlock(&wb->list_lock);
+ stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
+ stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
+ stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
+ stats->nr_written += wb_stat(wb, WB_WRITTEN);
+ stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+static void bdi_collect_stats(struct backing_dev_info *bdi,
+ struct wb_stats *stats)
+{
+ struct bdi_writeback *wb;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
+ if (!wb_tryget(wb))
+ continue;
+
+ collect_wb_stats(stats, wb);
+ wb_put(wb);
+ }
+ rcu_read_unlock();
+}
+#else
+static void bdi_collect_stats(struct backing_dev_info *bdi,
+ struct wb_stats *stats)
+{
+ collect_wb_stats(stats, &bdi->wb);
+}
+#endif
+
+static int bdi_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ struct wb_stats stats;
+ unsigned long tot_bw;
+
global_dirty_limits(&background_thresh, &dirty_thresh);
- wb_thresh = wb_calc_thresh(wb, dirty_thresh);
+
+ memset(&stats, 0, sizeof(stats));
+ stats.dirty_thresh = dirty_thresh;
+ bdi_collect_stats(bdi, &stats);
+ tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);
seq_printf(m,
"BdiWriteback: %10lu kB\n"
@@ -87,37 +137,114 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
- (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
- (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
- K(wb_thresh),
+ K(stats.nr_writeback),
+ K(stats.nr_reclaimable),
+ K(stats.wb_thresh),
K(dirty_thresh),
K(background_thresh),
- (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
- (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
- (unsigned long) K(wb->write_bandwidth),
- nr_dirty,
- nr_io,
- nr_more_io,
- nr_dirty_time,
+ K(stats.nr_dirtied),
+ K(stats.nr_written),
+ K(tot_bw),
+ stats.nr_dirty,
+ stats.nr_io,
+ stats.nr_more_io,
+ stats.nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->wb.state);
return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
+static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
+ struct wb_stats *stats)
+{
+
+ seq_printf(m,
+ "WbCgIno: %10lu\n"
+ "WbWriteback: %10lu kB\n"
+ "WbReclaimable: %10lu kB\n"
+ "WbDirtyThresh: %10lu kB\n"
+ "WbDirtied: %10lu kB\n"
+ "WbWritten: %10lu kB\n"
+ "WbWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
+ "state: %10lx\n\n",
+#ifdef CONFIG_CGROUP_WRITEBACK
+ cgroup_ino(wb->memcg_css->cgroup),
+#else
+ 1ul,
+#endif
+ K(stats->nr_writeback),
+ K(stats->nr_reclaimable),
+ K(stats->wb_thresh),
+ K(stats->nr_dirtied),
+ K(stats->nr_written),
+ K(wb->avg_write_bandwidth),
+ stats->nr_dirty,
+ stats->nr_io,
+ stats->nr_more_io,
+ stats->nr_dirty_time,
+ wb->state);
+}
+
+static int cgwb_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ struct bdi_writeback *wb;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
+ struct wb_stats stats = { .dirty_thresh = dirty_thresh };
+
+ if (!wb_tryget(wb))
+ continue;
+
+ collect_wb_stats(&stats, wb);
+
+ /*
+ * Calculate thresh of wb in writeback cgroup which is min of
+ * thresh in global domain and thresh in cgroup domain. Drop
+ * rcu lock because cgwb_calc_thresh may sleep in
+ * cgroup_rstat_flush. We can do so here because we have a ref.
+ */
+ if (mem_cgroup_wb_domain(wb)) {
+ rcu_read_unlock();
+ stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
+ rcu_read_lock();
+ }
+
+ wb_stats_show(m, wb, &stats);
+
+ wb_put(wb);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
+
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
&bdi_debug_stats_fops);
+ debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
+ &cgwb_debug_stats_fops);
}
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
debugfs_remove_recursive(bdi->debug_dir);
}
-#else
+#else /* CONFIG_DEBUG_FS */
static inline void bdi_debug_init(void)
{
}
@@ -128,7 +255,7 @@ static inline void bdi_debug_register(struct backing_dev_info *bdi,
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
-#endif
+#endif /* CONFIG_DEBUG_FS */
static ssize_t read_ahead_kb_store(struct device *dev,
struct device_attribute *attr,
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 5e6dc312072cd0..5685ba485097d2 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -16,8 +16,8 @@
#include "../internal.h"
#include "ops-common.h"
-static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
- unsigned long addr, void *arg)
+static bool damon_folio_mkold_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
@@ -31,33 +31,38 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
return true;
}
-static void damon_pa_mkold(unsigned long paddr)
+static void damon_folio_mkold(struct folio *folio)
{
- struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
struct rmap_walk_control rwc = {
- .rmap_one = __damon_pa_mkold,
+ .rmap_one = damon_folio_mkold_one,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!folio)
- return;
-
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
folio_set_idle(folio);
- goto out;
+ return;
}
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
if (need_lock && !folio_trylock(folio))
- goto out;
+ return;
rmap_walk(folio, &rwc);
if (need_lock)
folio_unlock(folio);
-out:
+}
+
+static void damon_pa_mkold(unsigned long paddr)
+{
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+
+ if (!folio)
+ return;
+
+ damon_folio_mkold(folio);
folio_put(folio);
}
@@ -79,8 +84,8 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
}
}
-static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
- unsigned long addr, void *arg)
+static bool damon_folio_young_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
bool *accessed = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
@@ -111,38 +116,44 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
return *accessed == false;
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+static bool damon_folio_young(struct folio *folio)
{
- struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
bool accessed = false;
struct rmap_walk_control rwc = {
.arg = &accessed,
- .rmap_one = __damon_pa_young,
+ .rmap_one = damon_folio_young_one,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!folio)
- return false;
-
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
if (folio_test_idle(folio))
- accessed = false;
+ return false;
else
- accessed = true;
- goto out;
+ return true;
}
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
if (need_lock && !folio_trylock(folio))
- goto out;
+ return false;
rmap_walk(folio, &rwc);
if (need_lock)
folio_unlock(folio);
-out:
+ return accessed;
+}
+
+static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
+{
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+ bool accessed;
+
+ if (!folio)
+ return false;
+
+ accessed = damon_folio_young(folio);
*folio_sz = folio_size(folio);
folio_put(folio);
return accessed;
@@ -203,6 +214,11 @@ static bool __damos_pa_filter_out(struct damos_filter *filter,
matched = filter->memcg_id == mem_cgroup_id(memcg);
rcu_read_unlock();
break;
+ case DAMOS_FILTER_TYPE_YOUNG:
+ matched = damon_folio_young(folio);
+ if (matched)
+ damon_folio_mkold(folio);
+ break;
default:
break;
}
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 53a90ac678fb98..bea5bc52846a69 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -343,6 +343,7 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
static const char * const damon_sysfs_scheme_filter_type_strs[] = {
"anon",
"memcg",
+ "young",
"addr",
"target",
};
diff --git a/mm/debug.c b/mm/debug.c
index b71186f1fb0b1b..69e524c3e6012e 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -55,21 +55,17 @@ static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
- int mapcount = atomic_read(&page->_mapcount) + 1;
+ int mapcount = atomic_read(&page->_mapcount);
char *type = "";
- /* Open-code page_mapcount() to avoid looking up a stale folio */
- if (mapcount < 0)
- mapcount = 0;
- if (folio_test_large(folio))
- mapcount += folio_entire_mapcount(folio);
-
+ mapcount = page_type_has_type(mapcount) ? 0 : mapcount + 1;
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
if (folio_test_large(folio)) {
- pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
+ pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
folio_order(folio),
+ folio_mapcount(folio),
folio_entire_mapcount(folio),
folio_nr_pages_mapped(folio),
atomic_read(&folio->_pincount));
diff --git a/mm/filemap.c b/mm/filemap.c
index 18f4bce3741ec3..ec273b00ce5fd6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -168,7 +168,7 @@ static void filemap_unaccount_folio(struct address_space *mapping,
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
if (mapping_exiting(mapping) && !folio_test_large(folio)) {
- int mapcount = page_mapcount(&folio->page);
+ int mapcount = folio_mapcount(folio);
if (folio_ref_count(folio) >= mapcount + 2) {
/*
@@ -3231,7 +3231,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return 0;
- ptep = pte_offset_map(vmf->pmd, vmf->address);
+ ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
if (unlikely(!ptep))
return VM_FAULT_NOPAGE;
@@ -3505,7 +3506,7 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned int *mmap_miss)
+ unsigned long *rss, unsigned int *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
@@ -3539,6 +3540,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
+ *rss += count;
folio_ref_add(folio, count);
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
@@ -3553,6 +3555,7 @@ skip:
if (count) {
set_pte_range(vmf, folio, page, count, addr);
+ *rss += count;
folio_ref_add(folio, count);
if (in_range(vmf->address, addr, count * PAGE_SIZE))
ret = VM_FAULT_NOPAGE;
@@ -3565,7 +3568,7 @@ skip:
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
struct folio *folio, unsigned long addr,
- unsigned int *mmap_miss)
+ unsigned long *rss, unsigned int *mmap_miss)
{
vm_fault_t ret = 0;
struct page *page = &folio->page;
@@ -3589,6 +3592,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
ret = VM_FAULT_NOPAGE;
set_pte_range(vmf, folio, page, 1, addr);
+ (*rss)++;
folio_ref_inc(folio);
return ret;
@@ -3605,7 +3609,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct folio *folio;
vm_fault_t ret = 0;
- unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved;
+ unsigned long rss = 0;
+ unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
rcu_read_lock();
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
@@ -3624,6 +3629,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
folio_put(folio);
goto out;
}
+
+ folio_type = mm_counter_file(folio);
do {
unsigned long end;
@@ -3635,15 +3642,16 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
if (!folio_test_large(folio))
ret |= filemap_map_order0_folio(vmf,
- folio, addr, &mmap_miss);
+ folio, addr, &rss, &mmap_miss);
else
ret |= filemap_map_folio_range(vmf, folio,
xas.xa_index - folio->index, addr,
- nr_pages, &mmap_miss);
+ nr_pages, &rss, &mmap_miss);
folio_unlock(folio);
folio_put(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
+ add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
rcu_read_unlock();
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index f31e0ce65b11d4..f05906006b3c33 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -10,12 +10,6 @@
#include <linux/swap.h>
#include "internal.h"
-struct address_space *page_mapping(struct page *page)
-{
- return folio_mapping(page_folio(page));
-}
-EXPORT_SYMBOL(page_mapping);
-
void unlock_page(struct page *page)
{
return folio_unlock(page_folio(page));
diff --git a/mm/gup.c b/mm/gup.c
index 8dcbeae714e268..2f7baf96f65593 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -89,7 +89,7 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- if (!put_devmap_managed_page_refs(&folio->page, refs))
+ if (!put_devmap_managed_folio_refs(folio, refs))
folio_put_refs(folio, refs);
goto retry;
}
@@ -156,7 +156,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
*/
if (unlikely((flags & FOLL_LONGTERM) &&
!folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_page_refs(&folio->page, refs))
+ if (!put_devmap_managed_folio_refs(folio, refs))
folio_put_refs(folio, refs);
return NULL;
}
@@ -198,7 +198,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_page_refs(&folio->page, refs))
+ if (!put_devmap_managed_folio_refs(folio, refs))
folio_put_refs(folio, refs);
}
@@ -2877,13 +2877,10 @@ static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags, struct page **pages)
{
while ((*nr) - nr_start) {
- struct page *page = pages[--(*nr)];
+ struct folio *folio = page_folio(pages[--(*nr)]);
- ClearPageReferenced(page);
- if (flags & FOLL_PIN)
- unpin_user_page(page);
- else
- put_page(page);
+ folio_clear_referenced(folio);
+ gup_put_folio(folio, 1, flags);
}
}
@@ -3024,6 +3021,7 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
struct dev_pagemap *pgmap = NULL;
do {
+ struct folio *folio;
struct page *page = pfn_to_page(pfn);
pgmap = get_dev_pagemap(pfn, pgmap);
@@ -3037,12 +3035,13 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
break;
}
- SetPageReferenced(page);
- pages[*nr] = page;
- if (unlikely(try_grab_page(page, flags))) {
+ folio = try_grab_folio(page, 1, flags);
+ if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
+ folio_set_referenced(folio);
+ pages[*nr] = page;
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 14f04fbc22d9aa..8261b56693978f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -81,10 +81,13 @@ unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- unsigned long vm_flags, bool smaps,
- bool in_pf, bool enforce_sysfs,
+ unsigned long vm_flags,
+ unsigned long tva_flags,
unsigned long orders)
{
+ bool smaps = tva_flags & TVA_SMAPS;
+ bool in_pf = tva_flags & TVA_IN_PF;
+ bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
/* Check the intersection of requested and supported orders. */
orders &= vma_is_anonymous(vma) ?
THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
@@ -526,6 +529,52 @@ static const struct kobj_type thpsize_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
};
+DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
+
+static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
+{
+ unsigned long sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
+
+ sum += this->stats[order][item];
+ }
+
+ return sum;
+}
+
+#define DEFINE_MTHP_STAT_ATTR(_name, _index) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ int order = to_thpsize(kobj)->order; \
+ \
+ return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index)); \
+} \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
+DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
+
+static struct attribute *stats_attrs[] = {
+ &anon_fault_alloc_attr.attr,
+ &anon_fault_fallback_attr.attr,
+ &anon_fault_fallback_charge_attr.attr,
+ &anon_swpout_attr.attr,
+ &anon_swpout_fallback_attr.attr,
+ NULL,
+};
+
+static struct attribute_group stats_attr_group = {
+ .name = "stats",
+ .attrs = stats_attrs,
+};
+
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
unsigned long size = (PAGE_SIZE << order) / SZ_1K;
@@ -549,6 +598,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
return ERR_PTR(ret);
}
+ ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
+ if (ret) {
+ kobject_put(&thpsize->kobj);
+ return ERR_PTR(ret);
+ }
+
thpsize->order = order;
return thpsize;
}
@@ -880,6 +935,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
folio_throttle_swaprate(folio, gfp);
@@ -929,6 +986,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
@@ -999,11 +1057,13 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
gfp_t gfp;
struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ vm_fault_t ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
return VM_FAULT_FALLBACK;
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
+ ret = vmf_anon_prepare(vmf);
+ if (ret)
+ return ret;
khugepaged_enter_vma(vma, vma->vm_flags);
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
@@ -1050,6 +1110,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
if (unlikely(!folio)) {
count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
@@ -1791,7 +1852,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio = page_folio(page);
folio_remove_rmap_pmd(folio, page, vma);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ WARN_ON_ONCE(folio_mapcount(folio) < 0);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
swp_entry_t entry;
@@ -2144,7 +2205,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
}
folio_move_anon_rmap(src_folio, dst_vma);
- WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
@@ -2389,12 +2450,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* for this pmd), then we flush the SMP TLB and finally we write the
* non-huge version of the pmd entry with pmd_populate.
*/
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2405,6 +2466,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
@@ -2900,7 +2962,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
*
* 3) The folio must not be pinned. Any unexpected folio references, including
* GUP pins, will result in the folio not getting split; instead, the caller
- * will receive an -EBUSY.
+ * will receive an -EAGAIN.
*
* 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
* supported for non-file-backed folios, because folio->_deferred_list, which
@@ -2919,8 +2981,16 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
*
* Returns 0 if the huge page was split successfully.
*
- * Returns -EBUSY if @page's folio is pinned, or if the anon_vma disappeared
- * from under us.
+ * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
+ * the folio was concurrently removed from the page cache.
+ *
+ * Returns -EBUSY when trying to split the huge zeropage, if the folio is
+ * under writeback, if fs-specific folio metadata cannot currently be
+ * released, or if some unexpected race happened (e.g., anon VMA disappeared,
+ * truncation).
+ *
+ * Returns -EINVAL when trying to split to an order that is incompatible
+ * with the folio. Splitting to order 0 is compatible with all folios.
*/
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5dc3f5ea3a2ef2..417fc5cdb6eebd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
struct page *p;
atomic_set(&folio->_entire_mapcount, 0);
- atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_large_mapcount, 0);
atomic_set(&folio->_pincount, 0);
for (i = 1; i < nr_pages; i++) {
@@ -2120,7 +2120,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
/* we rely on prep_new_hugetlb_folio to set the hugetlb flag */
folio_set_order(folio, order);
atomic_set(&folio->_entire_mapcount, -1);
- atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_large_mapcount, -1);
atomic_set(&folio->_pincount, 0);
return true;
@@ -2155,13 +2155,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
/*
* Find and lock address space (mapping) in write mode.
*
- * Upon entry, the page is locked which means that page_mapping() is
+ * Upon entry, the folio is locked which means that folio_mapping() is
* stable. Due to locking order, we can only trylock_write. If we can
* not get the lock, simply return NULL to caller.
*/
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
{
- struct address_space *mapping = page_mapping(hpage);
+ struct address_space *mapping = folio_mapping(folio);
if (!mapping)
return mapping;
@@ -2377,8 +2377,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
}
/*
- * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use hugepages and non-hugepages.
+ * Dissolve a given free hugetlb folio into free buddy pages. This function
+ * does nothing for in-use hugetlb folios and non-hugetlb folios.
* This function returns values like below:
*
* -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
@@ -2390,10 +2390,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
* 0: successfully dissolved free hugepages or the page is not a
* hugepage (considered as already dissolved)
*/
-int dissolve_free_huge_page(struct page *page)
+int dissolve_free_hugetlb_folio(struct folio *folio)
{
int rc = -EBUSY;
- struct folio *folio = page_folio(page);
retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
@@ -2470,13 +2469,13 @@ out:
* make specified memory blocks removable from the system.
* Note that this will dissolve a free gigantic hugepage completely, if any
* part of it lies within the given range.
- * Also note that if dissolve_free_huge_page() returns with an error, all
- * free hugepages that were dissolved before that error are lost.
+ * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
+ * free hugetlb folios that were dissolved before that error are lost.
*/
-int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
- struct page *page;
+ struct folio *folio;
int rc = 0;
unsigned int order;
struct hstate *h;
@@ -2489,8 +2488,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
order = min(order, huge_page_order(h));
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
- page = pfn_to_page(pfn);
- rc = dissolve_free_huge_page(page);
+ folio = pfn_folio(pfn);
+ rc = dissolve_free_hugetlb_folio(folio);
if (rc)
break;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index aa4486bd390493..e20339a346b935 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -308,7 +308,7 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
-
+ lockdep_assert_held(&hugetlb_lock);
__set_hugetlb_cgroup(folio, h_cg, rsvd);
if (!rsvd) {
unsigned long usage =
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index d0548e382b6ba2..c9d653f51e45bb 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val)
{
unsigned long pfn = val;
struct page *p;
- struct page *hpage;
+ struct folio *folio;
int err;
if (!capable(CAP_SYS_ADMIN))
@@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val)
return -ENXIO;
p = pfn_to_page(pfn);
- hpage = compound_head(p);
+ folio = page_folio(p);
if (!hwpoison_filter_enable)
goto inject;
- shake_page(hpage);
+ shake_folio(folio);
/*
* This implies unable to support non-LRU pages except free page.
*/
- if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p))
+ if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) &&
+ !is_free_buddy_page(p))
return 0;
/*
@@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val)
* the targeted owner (or on a free page).
* memory_failure() will redo the check reliably inside page lock.
*/
- err = hwpoison_filter(hpage);
+ err = hwpoison_filter(&folio->page);
if (err)
return 0;
diff --git a/mm/internal.h b/mm/internal.h
index 22152e0c8494a8..c5552d35d99551 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -72,6 +72,8 @@ void page_writeback_init(void);
/*
* How many individual pages have an elevated _mapcount. Excludes
* the folio's entire_mapcount.
+ *
+ * Don't use this function outside of debugging code.
*/
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
@@ -132,6 +134,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
* first one is writable.
* @any_young: Optional pointer to indicate whether any entry except the
* first one is young.
+ * @any_dirty: Optional pointer to indicate whether any entry except the
+ * first one is dirty.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
* pages of the same large folio.
@@ -147,18 +151,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
*/
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
- bool *any_writable, bool *any_young)
+ bool *any_writable, bool *any_young, bool *any_dirty)
{
unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
- bool writable, young;
+ bool writable, young, dirty;
int nr;
if (any_writable)
*any_writable = false;
if (any_young)
*any_young = false;
+ if (any_dirty)
+ *any_dirty = false;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
@@ -174,6 +180,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
writable = !!pte_write(pte);
if (any_young)
young = !!pte_young(pte);
+ if (any_dirty)
+ dirty = !!pte_dirty(pte);
pte = __pte_batch_clear_ignored(pte, flags);
if (!pte_same(pte, expected_pte))
@@ -191,6 +199,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
*any_writable |= writable;
if (any_young)
*any_young |= young;
+ if (any_dirty)
+ *any_dirty |= dirty;
nr = pte_batch_hint(ptep, pte);
expected_pte = pte_advance_pfn(expected_pte, nr);
@@ -611,6 +621,7 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
struct folio *folio = (struct folio *)page;
folio_set_order(folio, order);
+ atomic_set(&folio->_large_mapcount, -1);
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
@@ -1026,6 +1037,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
/*
* mm/memory-failure.c
*/
+void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
@@ -1423,6 +1435,43 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
int priority);
+#ifdef CONFIG_64BIT
+/* VM is sealed, in vm_flags */
+#define VM_SEALED _BITUL(63)
+#endif
+
+#ifdef CONFIG_64BIT
+static inline int can_do_mseal(unsigned long flags)
+{
+ if (flags)
+ return -EINVAL;
+
+ return 0;
+}
+
+bool can_modify_mm(struct mm_struct *mm, unsigned long start,
+ unsigned long end);
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior);
+#else
+static inline int can_do_mseal(unsigned long flags)
+{
+ return -EPERM;
+}
+
+static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
+ unsigned long end)
+{
+ return true;
+}
+
+static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior)
+{
+ return true;
+}
+#endif
+
#ifdef CONFIG_SHRINKER_DEBUG
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
struct shrinker *shrinker, const char *fmt, va_list ap)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 89e2624fb3ff11..774a97e6e2da39 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -453,7 +453,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_flags_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
@@ -583,7 +583,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
- if (page_mapcount(page) > 1) {
+ /* See hpage_collapse_scan_pmd(). */
+ if (folio_likely_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -899,6 +900,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
+ unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -909,8 +911,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
- cc->is_khugepaged, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -1317,8 +1318,20 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_NULL;
goto out_unmap;
}
+ folio = page_folio(page);
- if (page_mapcount(page) > 1) {
+ if (!folio_test_anon(folio)) {
+ result = SCAN_PAGE_ANON;
+ goto out_unmap;
+ }
+
+ /*
+ * We treat a single page as shared if any part of the THP
+ * is shared. "False negatives" from
+ * folio_likely_mapped_shared() are not expected to matter
+ * much in practice.
+ */
+ if (folio_likely_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -1328,7 +1341,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
}
}
- folio = page_folio(page);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
@@ -1349,16 +1361,12 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_LOCK;
goto out_unmap;
}
- if (!folio_test_anon(folio)) {
- result = SCAN_PAGE_ANON;
- goto out_unmap;
- }
/*
* Check if the page has any GUP (or other external) pins.
*
* Here the check may be racy:
- * it may see total_mapcount > refcount in some cases?
+ * it may see folio_mapcount() > folio_ref_count().
* But such case is ephemeral we could always retry collapse
* later. However it may report false positive if the page
* has excessive GUP pins (i.e. 512). Anyway the same check
@@ -1493,8 +1501,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
* analogously elide sysfs THP settings here.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
- PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2355,8 +2362,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
- true, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags,
+ TVA_ENFORCE_SYSFS, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2693,8 +2700,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
*prev = vma;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
- PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
diff --git a/mm/ksm.c b/mm/ksm.c
index 8c001819cf10f5..e1034bf1c937e9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -890,14 +890,14 @@ static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
free_stable_node(stable_node);
}
-enum get_ksm_page_flags {
- GET_KSM_PAGE_NOLOCK,
- GET_KSM_PAGE_LOCK,
- GET_KSM_PAGE_TRYLOCK
+enum ksm_get_folio_flags {
+ KSM_GET_FOLIO_NOLOCK,
+ KSM_GET_FOLIO_LOCK,
+ KSM_GET_FOLIO_TRYLOCK
};
/*
- * get_ksm_page: checks if the page indicated by the stable node
+ * ksm_get_folio: checks if the page indicated by the stable node
* is still its ksm page, despite having held no reference to it.
* In which case we can trust the content of the page, and it
* returns the gotten page; but if the page has now been zapped,
@@ -915,10 +915,10 @@ enum get_ksm_page_flags {
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
- enum get_ksm_page_flags flags)
+static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
+ enum ksm_get_folio_flags flags)
{
- struct page *page;
+ struct folio *folio;
void *expected_mapping;
unsigned long kpfn;
@@ -926,8 +926,8 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
PAGE_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
- page = pfn_to_page(kpfn);
- if (READ_ONCE(page->mapping) != expected_mapping)
+ folio = pfn_folio(kpfn);
+ if (READ_ONCE(folio->mapping) != expected_mapping)
goto stale;
/*
@@ -940,41 +940,41 @@ again:
* in folio_migrate_mapping(), it might still be our page,
* in which case it's essential to keep the node.
*/
- while (!get_page_unless_zero(page)) {
+ while (!folio_try_get(folio)) {
/*
* Another check for page->mapping != expected_mapping would
* work here too. We have chosen the !PageSwapCache test to
* optimize the common case, when the page is or is about to
* be freed: PageSwapCache is cleared (under spin_lock_irq)
* in the ref_freeze section of __remove_mapping(); but Anon
- * page->mapping reset to NULL later, in free_pages_prepare().
+ * folio->mapping reset to NULL later, in free_pages_prepare().
*/
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
goto stale;
cpu_relax();
}
- if (READ_ONCE(page->mapping) != expected_mapping) {
- put_page(page);
+ if (READ_ONCE(folio->mapping) != expected_mapping) {
+ folio_put(folio);
goto stale;
}
- if (flags == GET_KSM_PAGE_TRYLOCK) {
- if (!trylock_page(page)) {
- put_page(page);
+ if (flags == KSM_GET_FOLIO_TRYLOCK) {
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return ERR_PTR(-EBUSY);
}
- } else if (flags == GET_KSM_PAGE_LOCK)
- lock_page(page);
+ } else if (flags == KSM_GET_FOLIO_LOCK)
+ folio_lock(folio);
- if (flags != GET_KSM_PAGE_NOLOCK) {
- if (READ_ONCE(page->mapping) != expected_mapping) {
- unlock_page(page);
- put_page(page);
+ if (flags != KSM_GET_FOLIO_NOLOCK) {
+ if (READ_ONCE(folio->mapping) != expected_mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto stale;
}
}
- return page;
+ return folio;
stale:
/*
@@ -998,16 +998,16 @@ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
struct ksm_stable_node *stable_node;
- struct page *page;
+ struct folio *folio;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
- if (!page)
+ folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
+ if (!folio)
goto out;
hlist_del(&rmap_item->hlist);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (!hlist_empty(&stable_node->hlist))
ksm_pages_sharing--;
@@ -1094,11 +1094,11 @@ static inline struct ksm_stable_node *page_stable_node(struct page *page)
return folio_stable_node(page_folio(page));
}
-static inline void set_page_stable_node(struct page *page,
- struct ksm_stable_node *stable_node)
+static inline void folio_set_stable_node(struct folio *folio,
+ struct ksm_stable_node *stable_node)
{
- VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+ VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
+ folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}
#ifdef CONFIG_SYSFS
@@ -1107,13 +1107,13 @@ static inline void set_page_stable_node(struct page *page,
*/
static int remove_stable_node(struct ksm_stable_node *stable_node)
{
- struct page *page;
+ struct folio *folio;
int err;
- page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
- if (!page) {
+ folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
+ if (!folio) {
/*
- * get_ksm_page did remove_node_from_stable_tree itself.
+ * ksm_get_folio did remove_node_from_stable_tree itself.
*/
return 0;
}
@@ -1124,22 +1124,22 @@ static int remove_stable_node(struct ksm_stable_node *stable_node)
* merge_across_nodes/max_page_sharing be switched.
*/
err = -EBUSY;
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
/*
- * The stable node did not yet appear stale to get_ksm_page(),
- * since that allows for an unmapped ksm page to be recognized
+ * The stable node did not yet appear stale to ksm_get_folio(),
+ * since that allows for an unmapped ksm folio to be recognized
* right up until it is freed; but the node is safe to remove.
- * This page might be in an LRU cache waiting to be freed,
- * or it might be PageSwapCache (perhaps under writeback),
+ * This folio might be in an LRU cache waiting to be freed,
+ * or it might be in the swapcache (perhaps under writeback),
* or it might have been removed from swapcache a moment ago.
*/
- set_page_stable_node(page, NULL);
+ folio_set_stable_node(folio, NULL);
remove_node_from_stable_tree(stable_node);
err = 0;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
@@ -1275,23 +1275,24 @@ static u32 calc_checksum(struct page *page)
return checksum;
}
-static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
pte_t *orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0);
int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
bool anon_exclusive;
pte_t entry;
- pvmw.address = page_address_in_vma(page, vma);
+ if (WARN_ON_ONCE(folio_test_large(folio)))
+ return err;
+
+ pvmw.address = page_address_in_vma(&folio->page, vma);
if (pvmw.address == -EFAULT)
goto out;
- BUG_ON(PageTransCompound(page));
-
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1301,12 +1302,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
- anon_exclusive = PageAnonExclusive(page);
+ anon_exclusive = PageAnonExclusive(&folio->page);
entry = ptep_get(pvmw.pte);
if (pte_write(entry) || pte_dirty(entry) ||
anon_exclusive || mm_tlb_flush_pending(mm)) {
- swapped = PageSwapCache(page);
- flush_cache_page(vma, pvmw.address, page_to_pfn(page));
+ swapped = folio_test_swapcache(folio);
+ flush_cache_page(vma, pvmw.address, folio_pfn(folio));
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
@@ -1326,20 +1327,20 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
- if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
if (anon_exclusive &&
- folio_try_share_anon_rmap_pte(page_folio(page), page)) {
+ folio_try_share_anon_rmap_pte(folio, &folio->page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
if (pte_dirty(entry))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
entry = pte_mkclean(entry);
if (pte_write(entry))
@@ -1505,14 +1506,14 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
- if (write_protect_page(vma, page, &orig_pte) == 0) {
+ if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) {
if (!kpage) {
/*
* While we hold page lock, upgrade page from
* PageAnon+anon_vma to PageKsm+NULL stable_node:
* stable_tree_insert() will update stable_node.
*/
- set_page_stable_node(page, NULL);
+ folio_set_stable_node(page_folio(page), NULL);
mark_page_accessed(page);
/*
* Page reclaim just frees a clean page with no dirty
@@ -1617,14 +1618,14 @@ bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
return __is_page_sharing_candidate(stable_node, 0);
}
-static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
- struct ksm_stable_node **_stable_node,
- struct rb_root *root,
- bool prune_stale_stable_nodes)
+static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
{
struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
struct hlist_node *hlist_safe;
- struct page *_tree_page, *tree_page = NULL;
+ struct folio *folio, *tree_folio = NULL;
int nr = 0;
int found_rmap_hlist_len;
@@ -1643,24 +1644,24 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
* We must walk all stable_node_dup to prune the stale
* stable nodes during lookup.
*
- * get_ksm_page can drop the nodes from the
+ * ksm_get_folio can drop the nodes from the
* stable_node->hlist if they point to freed pages
* (that's why we do a _safe walk). The "dup"
* stable_node parameter itself will be freed from
* under us if it returns NULL.
*/
- _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
- if (!_tree_page)
+ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK);
+ if (!folio)
continue;
nr += 1;
if (is_page_sharing_candidate(dup)) {
if (!found ||
dup->rmap_hlist_len > found_rmap_hlist_len) {
if (found)
- put_page(tree_page);
+ folio_put(tree_folio);
found = dup;
found_rmap_hlist_len = found->rmap_hlist_len;
- tree_page = _tree_page;
+ tree_folio = folio;
/* skip put_page for found dup */
if (!prune_stale_stable_nodes)
@@ -1668,7 +1669,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
continue;
}
}
- put_page(_tree_page);
+ folio_put(folio);
}
if (found) {
@@ -1733,7 +1734,7 @@ static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
}
*_stable_node_dup = found;
- return tree_page;
+ return tree_folio;
}
static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
@@ -1750,7 +1751,7 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl
}
/*
- * Like for get_ksm_page, this function can free the *_stable_node and
+ * Like for ksm_get_folio, this function can free the *_stable_node and
* *_stable_node_dup if the returned tree_page is NULL.
*
* It can also free and overwrite *_stable_node with the found
@@ -1763,16 +1764,16 @@ static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stabl
* function and will be overwritten in all cases, the caller doesn't
* need to initialize it.
*/
-static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
- struct ksm_stable_node **_stable_node,
- struct rb_root *root,
- bool prune_stale_stable_nodes)
+static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
+ struct rb_root *root,
+ bool prune_stale_stable_nodes)
{
struct ksm_stable_node *stable_node = *_stable_node;
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
- return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
+ return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK);
}
/*
* _stable_node_dup set to NULL means the stable_node
@@ -1785,24 +1786,24 @@ static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_du
prune_stale_stable_nodes);
}
-static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
- struct ksm_stable_node **s_n,
- struct rb_root *root)
+static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node **s_n,
+ struct rb_root *root)
{
return __stable_node_chain(s_n_d, s_n, root, true);
}
-static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
- struct ksm_stable_node *s_n,
- struct rb_root *root)
+static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node *s_n,
+ struct rb_root *root)
{
struct ksm_stable_node *old_stable_node = s_n;
- struct page *tree_page;
+ struct folio *tree_folio;
- tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
+ tree_folio = __stable_node_chain(s_n_d, &s_n, root, false);
/* not pruning dups so s_n cannot have changed */
VM_BUG_ON(s_n != old_stable_node);
- return tree_page;
+ return tree_folio;
}
/*
@@ -1822,28 +1823,30 @@ static struct page *stable_tree_search(struct page *page)
struct rb_node *parent;
struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
struct ksm_stable_node *page_node;
+ struct folio *folio;
- page_node = page_stable_node(page);
+ folio = page_folio(page);
+ page_node = folio_stable_node(folio);
if (page_node && page_node->head != &migrate_nodes) {
/* ksm page forked */
- get_page(page);
- return page;
+ folio_get(folio);
+ return &folio->page;
}
- nid = get_kpfn_nid(page_to_pfn(page));
+ nid = get_kpfn_nid(folio_pfn(folio));
root = root_stable_tree + nid;
again:
new = &root->rb_node;
parent = NULL;
while (*new) {
- struct page *tree_page;
+ struct folio *tree_folio;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
- tree_page = chain_prune(&stable_node_dup, &stable_node, root);
+ tree_folio = chain_prune(&stable_node_dup, &stable_node, root);
/*
* NOTE: stable_node may have been freed by
* chain_prune() if the returned stable_node_dup is
@@ -1877,14 +1880,14 @@ again:
* write protected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any,
- GET_KSM_PAGE_NOLOCK);
+ tree_folio = ksm_get_folio(stable_node_any,
+ KSM_GET_FOLIO_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
- if (!tree_page) {
+ if (!tree_folio) {
/*
* If we walked over a stale stable_node,
- * get_ksm_page() will call rb_erase() and it
+ * ksm_get_folio() will call rb_erase() and it
* may rebalance the tree from under us. So
* restart the search from scratch. Returning
* NULL would be safe too, but we'd generate
@@ -1894,8 +1897,8 @@ again:
goto again;
}
- ret = memcmp_pages(page, tree_page);
- put_page(tree_page);
+ ret = memcmp_pages(page, &tree_folio->page);
+ folio_put(tree_folio);
parent = *new;
if (ret < 0)
@@ -1906,12 +1909,15 @@ again:
if (page_node) {
VM_BUG_ON(page_node->head != &migrate_nodes);
/*
- * Test if the migrated page should be merged
- * into a stable node dup. If the mapcount is
- * 1 we can migrate it with another KSM page
- * without adding it to the chain.
+ * If the mapcount of our migrated KSM folio is
+ * at most 1, we can merge it with another
+ * KSM folio where we know that we have space
+ * for one more mapping without exceeding the
+ * ksm_max_page_sharing limit: see
+ * chain_prune(). This way, we can avoid adding
+ * this stable node to the chain.
*/
- if (page_mapcount(page) > 1)
+ if (folio_mapcount(folio) > 1)
goto chain_append;
}
@@ -1938,26 +1944,26 @@ again:
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node_dup,
- GET_KSM_PAGE_TRYLOCK);
+ tree_folio = ksm_get_folio(stable_node_dup,
+ KSM_GET_FOLIO_TRYLOCK);
- if (PTR_ERR(tree_page) == -EBUSY)
+ if (PTR_ERR(tree_folio) == -EBUSY)
return ERR_PTR(-EBUSY);
- if (unlikely(!tree_page))
+ if (unlikely(!tree_folio))
/*
* The tree may have been rebalanced,
* so re-evaluate parent and new.
*/
goto again;
- unlock_page(tree_page);
+ folio_unlock(tree_folio);
if (get_kpfn_nid(stable_node_dup->kpfn) !=
NUMA(stable_node_dup->nid)) {
- put_page(tree_page);
+ folio_put(tree_folio);
goto replace;
}
- return tree_page;
+ return &tree_folio->page;
}
}
@@ -1970,8 +1976,8 @@ again:
rb_insert_color(&page_node->node, root);
out:
if (is_page_sharing_candidate(page_node)) {
- get_page(page);
- return page;
+ folio_get(folio);
+ return &folio->page;
} else
return NULL;
@@ -1996,12 +2002,12 @@ replace:
&page_node->node,
root);
if (is_page_sharing_candidate(page_node))
- get_page(page);
+ folio_get(folio);
else
- page = NULL;
+ folio = NULL;
} else {
rb_erase(&stable_node_dup->node, root);
- page = NULL;
+ folio = NULL;
}
} else {
VM_BUG_ON(!is_stable_node_chain(stable_node));
@@ -2012,16 +2018,16 @@ replace:
DO_NUMA(page_node->nid = nid);
stable_node_chain_add_dup(page_node, stable_node);
if (is_page_sharing_candidate(page_node))
- get_page(page);
+ folio_get(folio);
else
- page = NULL;
+ folio = NULL;
} else {
- page = NULL;
+ folio = NULL;
}
}
stable_node_dup->head = &migrate_nodes;
list_add(&stable_node_dup->list, stable_node_dup->head);
- return page;
+ return &folio->page;
chain_append:
/* stable_node_dup could be null if it reached the limit */
@@ -2064,7 +2070,7 @@ chain_append:
* This function returns the stable tree node just allocated on success,
* NULL otherwise.
*/
-static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
+static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio)
{
int nid;
unsigned long kpfn;
@@ -2074,7 +2080,7 @@ static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
bool need_chain = false;
- kpfn = page_to_pfn(kpage);
+ kpfn = folio_pfn(kfolio);
nid = get_kpfn_nid(kpfn);
root = root_stable_tree + nid;
again:
@@ -2082,13 +2088,13 @@ again:
new = &root->rb_node;
while (*new) {
- struct page *tree_page;
+ struct folio *tree_folio;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
- tree_page = chain(&stable_node_dup, stable_node, root);
+ tree_folio = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
/*
* Either all stable_node dups were full in
@@ -2110,14 +2116,14 @@ again:
* write protected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any,
- GET_KSM_PAGE_NOLOCK);
+ tree_folio = ksm_get_folio(stable_node_any,
+ KSM_GET_FOLIO_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
- if (!tree_page) {
+ if (!tree_folio) {
/*
* If we walked over a stale stable_node,
- * get_ksm_page() will call rb_erase() and it
+ * ksm_get_folio() will call rb_erase() and it
* may rebalance the tree from under us. So
* restart the search from scratch. Returning
* NULL would be safe too, but we'd generate
@@ -2127,8 +2133,8 @@ again:
goto again;
}
- ret = memcmp_pages(kpage, tree_page);
- put_page(tree_page);
+ ret = memcmp_pages(&kfolio->page, &tree_folio->page);
+ folio_put(tree_folio);
parent = *new;
if (ret < 0)
@@ -2147,7 +2153,7 @@ again:
INIT_HLIST_HEAD(&stable_node_dup->hlist);
stable_node_dup->kpfn = kpfn;
- set_page_stable_node(kpage, stable_node_dup);
+ folio_set_stable_node(kfolio, stable_node_dup);
stable_node_dup->rmap_hlist_len = 0;
DO_NUMA(stable_node_dup->nid = nid);
if (!need_chain) {
@@ -2425,7 +2431,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
* node in the stable tree and add both rmap_items.
*/
lock_page(kpage);
- stable_node = stable_tree_insert(kpage);
+ stable_node = stable_tree_insert(page_folio(kpage));
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node,
false);
@@ -2597,14 +2603,14 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
*/
if (!ksm_merge_across_nodes) {
struct ksm_stable_node *stable_node, *next;
- struct page *page;
+ struct folio *folio;
list_for_each_entry_safe(stable_node, next,
&migrate_nodes, list) {
- page = get_ksm_page(stable_node,
- GET_KSM_PAGE_NOLOCK);
- if (page)
- put_page(page);
+ folio = ksm_get_folio(stable_node,
+ KSM_GET_FOLIO_NOLOCK);
+ if (folio)
+ folio_put(folio);
cond_resched();
}
}
@@ -3172,12 +3178,11 @@ again:
/*
* Collect processes when the error hit an ksm page.
*/
-void collect_procs_ksm(struct page *page, struct list_head *to_kill,
- int force_early)
+void collect_procs_ksm(struct folio *folio, struct page *page,
+ struct list_head *to_kill, int force_early)
{
struct ksm_stable_node *stable_node;
struct ksm_rmap_item *rmap_item;
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -3229,11 +3234,11 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
/*
* newfolio->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
- * to get_ksm_page() before it can see that folio->mapping
+ * to ksm_get_folio() before it can see that folio->mapping
* has gone stale (or that folio_test_swapcache has been cleared).
*/
smp_wmb();
- set_page_stable_node(&folio->page, NULL);
+ folio_set_stable_node(folio, NULL);
}
}
#endif /* CONFIG_MIGRATION */
@@ -3256,7 +3261,7 @@ static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn) {
/*
- * Don't get_ksm_page, page has already gone:
+ * Don't ksm_get_folio, page has already gone:
* which is why we keep kpfn instead of page*
*/
remove_node_from_stable_tree(stable_node);
@@ -3344,7 +3349,7 @@ static int ksm_memory_callback(struct notifier_block *self,
* Most of the work is done by page migration; but there might
* be a few stable_nodes left over, still pointing to struct
* pages which have been offlined: prune those from the tree,
- * otherwise get_ksm_page() might later try to access a
+ * otherwise ksm_get_folio() might later try to access a
* non-existent struct page.
*/
ksm_check_stable_tree(mn->start_pfn,
diff --git a/mm/madvise.c b/mm/madvise.c
index f59169888b8ee2..c49fef453491d0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,6 +321,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
+ struct folio *folio, pte_t *ptep,
+ pte_t pte, bool *any_young,
+ bool *any_dirty)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = (end - addr) / PAGE_SIZE;
+
+ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+ any_young, any_dirty);
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -456,13 +468,10 @@ restart:
* next pte in the range.
*/
if (folio_test_large(folio)) {
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
- FPB_IGNORE_SOFT_DIRTY;
- int max_nr = (end - addr) / PAGE_SIZE;
bool any_young;
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
- fpb_flags, NULL, &any_young);
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, NULL);
if (any_young)
ptent = pte_mkyoung(ptent);
@@ -507,7 +516,8 @@ restart:
continue;
if (!pageout && pte_young(ptent)) {
- mkold_ptes(vma, addr, pte, nr);
+ clear_young_dirty_ptes(vma, addr, pte, nr,
+ CYDP_CLEAR_YOUNG);
tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
@@ -633,6 +643,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
struct mmu_gather *tlb = walk->private;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
@@ -687,44 +698,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
/*
- * If pmd isn't transhuge but the folio is large and
- * is owned by only this process, split it and
- * deactivate all pages.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be marked as lazyfree. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
+ bool any_young, any_dirty;
- if (folio_likely_mapped_shared(folio))
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, &any_dirty);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_likely_mapped_shared(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte;
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
+
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+ if (any_dirty)
+ ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
if (!folio_trylock(folio))
continue;
/*
- * If folio is shared with others, we mustn't clear
- * the folio's dirty flag.
+ * If we have a large folio at this point, we know it is
+ * fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (folio_mapcount(folio) != 1) {
+ if (folio_mapcount(folio) != folio_nr_pages(folio)) {
folio_unlock(folio);
continue;
}
@@ -740,19 +764,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
if (pte_young(ptent) || pte_dirty(ptent)) {
- /*
- * Some of architecture(ex, PPC) don't update TLB
- * with set_pte_at and tlb_remove_tlb_entry so for
- * the portability, remap the pte with old|clean
- * after pte clearing.
- */
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
-
- ptent = pte_mkold(ptent);
- ptent = pte_mkclean(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
folio_mark_lazyfree(folio);
}
@@ -1388,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
+ * -EPERM - memory is sealed.
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
@@ -1431,6 +1445,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
start = untagged_addr_remote(mm, start);
end = start + len;
+ /*
+ * Check if the address range is sealed for do_madvise().
+ * can_modify_mm_madv assumes we have acquired the lock on MM.
+ */
+ if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
+ error = -EPERM;
+ goto out;
+ }
+
blk_start_plug(&plug);
switch (behavior) {
case MADV_POPULATE_READ:
@@ -1443,6 +1466,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
break;
}
blk_finish_plug(&plug);
+
+out:
if (write)
mmap_write_unlock(mm);
else
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29fa5b17a1ef58..602ad5faad4dd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -53,6 +53,7 @@
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
+#include <linux/parser.h>
#include <linux/vmpressure.h>
#include <linux/memremap.h>
#include <linux/mm_inline.h>
@@ -715,6 +716,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
struct memcg_vmstats_percpu *statc;
int cpu = smp_processor_id();
+ unsigned int stats_updates;
if (!val)
return;
@@ -722,8 +724,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
cgroup_rstat_updated(memcg->css.cgroup, cpu);
statc = this_cpu_ptr(memcg->vmstats_percpu);
for (; statc; statc = statc->parent) {
- statc->stats_updates += abs(val);
- if (statc->stats_updates < MEMCG_CHARGE_BATCH)
+ stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
+ WRITE_ONCE(statc->stats_updates, stats_updates);
+ if (stats_updates < MEMCG_CHARGE_BATCH)
continue;
/*
@@ -731,9 +734,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
* redundant. Avoid the overhead of the atomic update.
*/
if (!memcg_vmstats_needs_flush(statc->vmstats))
- atomic64_add(statc->stats_updates,
+ atomic64_add(stats_updates,
&statc->vmstats->stats_updates);
- statc->stats_updates = 0;
+ WRITE_ONCE(statc->stats_updates, 0);
}
}
@@ -836,8 +839,9 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
return x;
}
-void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
- int val)
+static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx,
+ int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
@@ -859,10 +863,13 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
case NR_ANON_THPS:
case NR_SHMEM_PMDMAPPED:
case NR_FILE_PMDMAPPED:
- WARN_ON_ONCE(!in_task());
+ if (WARN_ON_ONCE(!in_task()))
+ pr_warn("stat item index: %d\n", idx);
break;
default:
VM_WARN_ON_IRQS_ENABLED();
+ if (!irqs_disabled())
+ pr_warn("stat item index: %d\n", idx);
}
}
@@ -2474,7 +2481,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP);
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2780,7 +2788,7 @@ retry:
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options);
+ gfp_mask, reclaim_options, NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2978,21 +2986,19 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
-/*
- * mod_objcg_mlstate() may be called with irq enabled, so
- * mod_memcg_lruvec_state() should be used.
- */
-static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
- struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
+static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
+ lockdep_assert_irqs_disabled();
+
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
+ __mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
}
@@ -3312,7 +3318,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
obj_cgroup_put(objcg);
}
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct memcg_stock_pcp *stock;
@@ -3340,12 +3346,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
struct pglist_data *oldpg = stock->cached_pgdat;
if (stock->nr_slab_reclaimable_b) {
- mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+ __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+ __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
}
@@ -3371,7 +3377,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
}
}
if (nr)
- mod_objcg_mlstate(objcg, pgdat, idx, nr);
+ __mod_objcg_mlstate(objcg, pgdat, idx, nr);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
@@ -3437,13 +3443,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
*/
if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
if (stock->nr_slab_reclaimable_b) {
- mod_objcg_mlstate(old, stock->cached_pgdat,
+ __mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_RECLAIMABLE_B,
stock->nr_slab_reclaimable_b);
stock->nr_slab_reclaimable_b = 0;
}
if (stock->nr_slab_unreclaimable_b) {
- mod_objcg_mlstate(old, stock->cached_pgdat,
+ __mod_objcg_mlstate(old, stock->cached_pgdat,
NR_SLAB_UNRECLAIMABLE_B,
stock->nr_slab_unreclaimable_b);
stock->nr_slab_unreclaimable_b = 0;
@@ -3753,7 +3759,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
ret = -EBUSY;
break;
}
@@ -3867,7 +3873,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP))
+ MEMCG_RECLAIM_MAY_SWAP, NULL))
nr_retries--;
}
@@ -4430,7 +4436,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val > 200)
+ if (val > MAX_SWAPPINESS)
return -EINVAL;
if (!mem_cgroup_is_root(memcg))
@@ -5888,7 +5894,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
}
}
- statc->stats_updates = 0;
+ WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
if (atomic64_read(&memcg->vmstats->stats_updates))
atomic64_set(&memcg->vmstats->stats_updates, 0);
@@ -6831,7 +6837,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
if (!reclaimed && !nr_retries--)
break;
@@ -6880,7 +6886,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
nr_reclaims--;
continue;
}
@@ -7010,19 +7016,50 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
+enum {
+ MEMORY_RECLAIM_SWAPPINESS = 0,
+ MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t tokens = {
+ { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int swappiness = -1;
unsigned int reclaim_options;
- int err;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
buf = strstrip(buf);
- err = page_counter_memparse(buf, "", &nr_to_reclaim);
- if (err)
- return err;
+
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ switch (match_token(start, tokens, args)) {
+ case MEMORY_RECLAIM_SWAPPINESS:
+ if (match_int(&args[0], &swappiness))
+ return -EINVAL;
+ if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
@@ -7042,7 +7079,9 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
lru_add_drain_all();
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- batch_size, GFP_KERNEL, reclaim_options);
+ batch_size, GFP_KERNEL,
+ reclaim_options,
+ swappiness == -1 ? NULL : &swappiness);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f22330600ffbc..16ada4fb02b799 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -155,7 +155,7 @@ static int __page_handle_poison(struct page *page)
/*
* zone_pcp_disable() can't be used here. It will
- * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold
+ * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
* cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
* optimization is enabled. This will break current lock dependency
* chain and leads to deadlock.
@@ -165,7 +165,7 @@ static int __page_handle_poison(struct page *page)
* but nothing guarantees that those pages do not get back to a PCP
* queue if we need to refill those.
*/
- ret = dissolve_free_huge_page(page);
+ ret = dissolve_free_hugetlb_folio(page_folio(page));
if (!ret) {
drain_all_pages(page_zone(page));
ret = take_page_off_buddy(page);
@@ -178,8 +178,8 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
{
if (hugepage_or_freepage) {
/*
- * Doing this check for free pages is also fine since dissolve_free_huge_page
- * returns 0 for non-hugetlb pages as well.
+ * Doing this check for free pages is also fine since
+ * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
*/
if (__page_handle_poison(page) <= 0)
/*
@@ -216,6 +216,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
static int hwpoison_filter_dev(struct page *p)
{
+ struct folio *folio = page_folio(p);
struct address_space *mapping;
dev_t dev;
@@ -223,7 +224,7 @@ static int hwpoison_filter_dev(struct page *p)
hwpoison_filter_dev_minor == ~0U)
return 0;
- mapping = page_mapping(p);
+ mapping = folio_mapping(folio);
if (mapping == NULL || mapping->host == NULL)
return -EINVAL;
@@ -369,20 +370,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
* Unknown page type encountered. Try to check whether it can turn PageLRU by
* lru_add_drain_all.
*/
-void shake_page(struct page *p)
+void shake_folio(struct folio *folio)
{
- if (PageHuge(p))
+ if (folio_test_hugetlb(folio))
return;
/*
* TODO: Could shrink slab caches here if a lightweight range-based
* shrinker will be available.
*/
- if (PageSlab(p))
+ if (folio_test_slab(folio))
return;
lru_add_drain_all();
}
-EXPORT_SYMBOL_GPL(shake_page);
+EXPORT_SYMBOL_GPL(shake_folio);
+
+static void shake_page(struct page *page)
+{
+ shake_folio(page_folio(page));
+}
static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
unsigned long address)
@@ -427,21 +433,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* not much we can do. We just print a message and ignore otherwise.
*/
-#define FSDAX_INVALID_PGOFF ULONG_MAX
-
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- *
- * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
- * filesystem with a memory failure handler has claimed the
- * memory_failure event. In all other cases, page->index and
- * page->mapping are sufficient for mapping the page back to its
- * corresponding user virtual address.
*/
static void __add_to_kill(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
- unsigned long ksm_addr, pgoff_t fsdax_pgoff)
+ unsigned long addr)
{
struct to_kill *tk;
@@ -451,12 +449,10 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
return;
}
- tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
- if (is_zone_device_page(p)) {
- if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
- tk->addr = vma_address(vma, fsdax_pgoff, 1);
+ tk->addr = addr;
+ if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
- } else
+ else
tk->size_shift = page_shift(compound_head(p));
/*
@@ -483,10 +479,12 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
}
static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long addr)
{
- __add_to_kill(tsk, p, vma, to_kill, 0, FSDAX_INVALID_PGOFF);
+ if (addr == -EFAULT)
+ return;
+ __add_to_kill(tsk, p, vma, to_kill, addr);
}
#ifdef CONFIG_KSM
@@ -502,12 +500,13 @@ static bool task_in_to_kill_list(struct list_head *to_kill,
return false;
}
+
void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma, struct list_head *to_kill,
- unsigned long ksm_addr)
+ unsigned long addr)
{
if (!task_in_to_kill_list(to_kill, tsk))
- __add_to_kill(tsk, p, vma, to_kill, ksm_addr, FSDAX_INVALID_PGOFF);
+ __add_to_kill(tsk, p, vma, to_kill, addr);
}
#endif
/*
@@ -609,7 +608,6 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
static void collect_procs_anon(struct folio *folio, struct page *page,
struct list_head *to_kill, int force_early)
{
- struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
pgoff_t pgoff;
@@ -621,8 +619,10 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
pgoff = page_to_pgoff(page);
rcu_read_lock();
for_each_process(tsk) {
+ struct vm_area_struct *vma;
struct anon_vma_chain *vmac;
struct task_struct *t = task_early_kill(tsk, force_early);
+ unsigned long addr;
if (!t)
continue;
@@ -631,9 +631,8 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
vma = vmac->vma;
if (vma->vm_mm != t->mm)
continue;
- if (!page_mapped_in_vma(page, vma))
- continue;
- add_to_kill_anon_file(t, page, vma, to_kill);
+ addr = page_mapped_in_vma(page, vma);
+ add_to_kill_anon_file(t, page, vma, to_kill, addr);
}
}
rcu_read_unlock();
@@ -656,6 +655,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
pgoff = page_to_pgoff(page);
for_each_process(tsk) {
struct task_struct *t = task_early_kill(tsk, force_early);
+ unsigned long addr;
if (!t)
continue;
@@ -668,8 +668,10 @@ static void collect_procs_file(struct folio *folio, struct page *page,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == t->mm)
- add_to_kill_anon_file(t, page, vma, to_kill);
+ if (vma->vm_mm != t->mm)
+ continue;
+ addr = page_address_in_vma(page, vma);
+ add_to_kill_anon_file(t, page, vma, to_kill, addr);
}
}
rcu_read_unlock();
@@ -681,7 +683,8 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
struct vm_area_struct *vma,
struct list_head *to_kill, pgoff_t pgoff)
{
- __add_to_kill(tsk, p, vma, to_kill, 0, pgoff);
+ unsigned long addr = vma_address(vma, pgoff, 1);
+ __add_to_kill(tsk, p, vma, to_kill, addr);
}
/*
@@ -726,9 +729,9 @@ static void collect_procs(struct folio *folio, struct page *page,
{
if (!folio->mapping)
return;
- if (unlikely(PageKsm(page)))
- collect_procs_ksm(page, tokill, force_early);
- else if (PageAnon(page))
+ if (unlikely(folio_test_ksm(folio)))
+ collect_procs_ksm(folio, page, tokill, force_early);
+ else if (folio_test_anon(folio))
collect_procs_anon(folio, page, tokill, force_early);
else
collect_procs_file(folio, page, tokill, force_early);
@@ -1088,7 +1091,8 @@ out:
*/
static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
- struct address_space *mapping = page_mapping(p);
+ struct folio *folio = page_folio(p);
+ struct address_space *mapping = folio_mapping(folio);
SetPageError(p);
/* TBD: print more information about the file. */
@@ -1557,24 +1561,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
*/
-static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int flags, struct page *hpage)
+static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
+ unsigned long pfn, int flags)
{
- struct folio *folio = page_folio(hpage);
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
- bool mlocked = PageMlocked(hpage);
+ bool mlocked = folio_test_mlocked(folio);
/*
* Here we are interested only in user-mapped pages, so skip any
* other types of pages.
*/
- if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
+ if (folio_test_reserved(folio) || folio_test_slab(folio) ||
+ folio_test_pgtable(folio) || folio_test_offline(folio))
return true;
- if (!(PageLRU(hpage) || PageHuge(p)))
+ if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
return true;
/*
@@ -1584,7 +1588,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (!page_mapped(p))
return true;
- if (PageSwapCache(p)) {
+ if (folio_test_swapcache(folio)) {
pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu &= ~TTU_HWPOISON;
}
@@ -1595,11 +1599,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
- mapping = page_mapping(hpage);
- if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
+ mapping = folio_mapping(folio);
+ if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
mapping_can_writeback(mapping)) {
- if (page_mkclean(hpage)) {
- SetPageDirty(hpage);
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
} else {
ttu &= ~TTU_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
@@ -1614,7 +1618,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- if (PageHuge(hpage) && !PageAnon(hpage)) {
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb pages in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1622,7 +1626,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* TTU_RMAP_LOCKED to indicate we have taken the lock
* at this higher level.
*/
- mapping = hugetlb_page_mapping_lock_write(hpage);
+ mapping = hugetlb_folio_mapping_lock_write(folio);
if (mapping) {
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
@@ -1634,15 +1638,15 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
unmap_success = !page_mapped(p);
if (!unmap_success)
- pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(p));
+ pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
+ pfn, folio_mapcount(page_folio(p)));
/*
* try_to_unmap() might put mlocked page in lru cache, so call
* shake_page() again to ensure that it's flushed.
*/
if (mlocked)
- shake_page(hpage);
+ shake_folio(folio);
/*
* Now that the dirty bit has been propagated to the
@@ -1654,7 +1658,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+ forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
!unmap_success;
kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
@@ -2098,7 +2102,7 @@ retry:
page_flags = folio->flags;
- if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) {
+ if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
folio_unlock(folio);
return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
}
@@ -2187,7 +2191,7 @@ out:
int memory_failure(unsigned long pfn, int flags)
{
struct page *p;
- struct page *hpage;
+ struct folio *folio;
struct dev_pagemap *pgmap;
int res = 0;
unsigned long page_flags;
@@ -2275,8 +2279,8 @@ try_again:
}
}
- hpage = compound_head(p);
- if (PageTransHuge(hpage)) {
+ folio = page_folio(p);
+ if (folio_test_large(folio)) {
/*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
@@ -2290,12 +2294,13 @@ try_again:
* or unhandlable page. The refcount is bumped iff the
* page is a valid handlable page.
*/
- SetPageHasHWPoisoned(hpage);
+ folio_set_has_hwpoisoned(folio);
if (try_to_split_thp_page(p) < 0) {
res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
+ folio = page_folio(p);
}
/*
@@ -2306,9 +2311,9 @@ try_again:
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- shake_page(p);
+ shake_folio(folio);
- lock_page(p);
+ folio_lock(folio);
/*
* We're only intended to deal with the non-Compound page here.
@@ -2316,11 +2321,11 @@ try_again:
* race window. If this happens, we could try again to hopefully
* handle the page next round.
*/
- if (PageCompound(p)) {
+ if (folio_test_large(folio)) {
if (retry) {
ClearPageHWPoison(p);
- unlock_page(p);
- put_page(p);
+ folio_unlock(folio);
+ folio_put(folio);
flags &= ~MF_COUNT_INCREASED;
retry = false;
goto try_again;
@@ -2336,35 +2341,35 @@ try_again:
* folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
* status correctly, we save a copy of the page flags at this time.
*/
- page_flags = p->flags;
+ page_flags = folio->flags;
if (hwpoison_filter(p)) {
ClearPageHWPoison(p);
- unlock_page(p);
- put_page(p);
+ folio_unlock(folio);
+ folio_put(folio);
res = -EOPNOTSUPP;
goto unlock_mutex;
}
/*
- * __munlock_folio() may clear a writeback page's LRU flag without
- * page_lock. We need wait writeback completion for this page or it
- * may trigger vfs BUG while evict inode.
+ * __munlock_folio() may clear a writeback folio's LRU flag without
+ * the folio lock. We need to wait for writeback completion for this
+ * folio or it may trigger a vfs BUG while evicting inode.
*/
- if (!PageLRU(p) && !PageWriteback(p))
+ if (!folio_test_lru(folio) && !folio_test_writeback(folio))
goto identify_page_state;
/*
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- wait_on_page_writeback(p);
+ folio_wait_writeback(folio);
/*
* Now take care of user space mappings.
* Abort on fail: __filemap_remove_folio() assumes unmapped page.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, p)) {
+ if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
goto unlock_page;
}
@@ -2372,7 +2377,8 @@ try_again:
/*
* Torn down by someone else?
*/
- if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+ if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
+ folio->mapping == NULL) {
res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
goto unlock_page;
}
@@ -2382,7 +2388,7 @@ identify_page_state:
mutex_unlock(&mf_mutex);
return res;
unlock_page:
- unlock_page(p);
+ folio_unlock(folio);
unlock_mutex:
mutex_unlock(&mf_mutex);
return res;
@@ -2552,8 +2558,8 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (folio_test_slab(folio) || PageTable(&folio->page) ||
- folio_test_reserved(folio) || PageOffline(&folio->page))
+ if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
+ folio_test_reserved(folio) || folio_test_offline(folio))
goto unlock_mutex;
/*
@@ -2574,7 +2580,7 @@ int unpoison_memory(unsigned long pfn)
ghp = get_hwpoison_page(p, MF_UNPOISON);
if (!ghp) {
- if (PageHuge(p)) {
+ if (folio_test_hugetlb(folio)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
if (count == 0)
@@ -2590,7 +2596,7 @@ int unpoison_memory(unsigned long pfn)
pfn, &unpoison_rs);
}
} else {
- if (PageHuge(p)) {
+ if (folio_test_hugetlb(folio)) {
huge = true;
count = folio_free_raw_hwp(folio, false);
if (count == 0) {
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0537664620e5f5..6632102bd5c99d 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -36,6 +36,11 @@ struct node_memory_type_map {
static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
+/*
+ * The list is used to store all memory types that are not created
+ * by a device driver.
+ */
+static LIST_HEAD(default_memory_types);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
struct memory_dev_type *default_dram_type;
@@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
+/* The lock is used to protect `default_dram_perf*` info and nid. */
+static DEFINE_MUTEX(default_dram_perf_lock);
static bool default_dram_perf_error;
static struct access_coordinate default_dram_perf;
static int default_dram_perf_ref_nid = NUMA_NO_NODE;
@@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem
static struct memory_tier *set_node_memory_tier(int node)
{
struct memory_tier *memtier;
- struct memory_dev_type *memtype;
+ struct memory_dev_type *memtype = default_dram_type;
+ int adist = MEMTIER_ADISTANCE_DRAM;
pg_data_t *pgdat = NODE_DATA(node);
@@ -514,7 +522,16 @@ static struct memory_tier *set_node_memory_tier(int node)
if (!node_state(node, N_MEMORY))
return ERR_PTR(-EINVAL);
- __init_node_memory_type(node, default_dram_type);
+ mt_calc_adistance(node, &adist);
+ if (!node_memory_types[node].memtype) {
+ memtype = mt_find_alloc_memory_type(adist, &default_memory_types);
+ if (IS_ERR(memtype)) {
+ memtype = default_dram_type;
+ pr_info("Failed to allocate a memory type. Fall back.\n");
+ }
+ }
+
+ __init_node_memory_type(node, memtype);
memtype = node_memory_types[node].memtype;
node_set(node, memtype->nodes);
@@ -623,6 +640,64 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
}
EXPORT_SYMBOL_GPL(clear_node_memory_type);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
+{
+ struct memory_dev_type *mtype;
+
+ list_for_each_entry(mtype, memory_types, list)
+ if (mtype->adistance == adist)
+ return mtype;
+
+ mtype = alloc_memory_type(adist);
+ if (IS_ERR(mtype))
+ return mtype;
+
+ list_add(&mtype->list, memory_types);
+
+ return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+ struct memory_dev_type *mtype, *mtn;
+
+ list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+ list_del(&mtype->list);
+ put_memory_type(mtype);
+ }
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
+
+/*
+ * This is invoked via `late_initcall()` to initialize memory tiers for
+ * CPU-less memory nodes after driver initialization, which is
+ * expected to provide `adistance` algorithms.
+ */
+static int __init memory_tier_late_init(void)
+{
+ int nid;
+
+ guard(mutex)(&memory_tier_lock);
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Some device drivers may have initialized memory tiers
+ * between `memory_tier_init()` and `memory_tier_late_init()`,
+ * potentially bringing online memory nodes and
+ * configuring memory tiers. Exclude them here.
+ */
+ if (node_memory_types[nid].memtype)
+ continue;
+
+ set_node_memory_tier(nid);
+ }
+
+ establish_demotion_targets();
+
+ return 0;
+}
+late_initcall(memory_tier_late_init);
+
static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
{
pr_info(
@@ -634,25 +709,19 @@ static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
const char *source)
{
- int rc = 0;
-
- mutex_lock(&memory_tier_lock);
- if (default_dram_perf_error) {
- rc = -EIO;
- goto out;
- }
+ guard(mutex)(&default_dram_perf_lock);
+ if (default_dram_perf_error)
+ return -EIO;
if (perf->read_latency + perf->write_latency == 0 ||
- perf->read_bandwidth + perf->write_bandwidth == 0) {
- rc = -EINVAL;
- goto out;
- }
+ perf->read_bandwidth + perf->write_bandwidth == 0)
+ return -EINVAL;
if (default_dram_perf_ref_nid == NUMA_NO_NODE) {
default_dram_perf = *perf;
default_dram_perf_ref_nid = nid;
default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL);
- goto out;
+ return 0;
}
/*
@@ -680,27 +749,25 @@ int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
pr_info(
" disable default DRAM node performance based abstract distance algorithm.\n");
default_dram_perf_error = true;
- rc = -EINVAL;
+ return -EINVAL;
}
-out:
- mutex_unlock(&memory_tier_lock);
- return rc;
+ return 0;
}
int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
{
+ guard(mutex)(&default_dram_perf_lock);
if (default_dram_perf_error)
return -EIO;
- if (default_dram_perf_ref_nid == NUMA_NO_NODE)
- return -ENOENT;
-
if (perf->read_latency + perf->write_latency == 0 ||
perf->read_bandwidth + perf->write_bandwidth == 0)
return -EINVAL;
- mutex_lock(&memory_tier_lock);
+ if (default_dram_perf_ref_nid == NUMA_NO_NODE)
+ return -ENOENT;
+
/*
* The abstract distance of a memory node is in direct proportion to
* its memory latency (read + write) and inversely proportional to its
@@ -713,7 +780,6 @@ int mt_perf_to_adistance(struct access_coordinate *perf, int *adist)
(default_dram_perf.read_latency + default_dram_perf.write_latency) *
(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
(perf->read_bandwidth + perf->write_bandwidth);
- mutex_unlock(&memory_tier_lock);
return 0;
}
@@ -826,7 +892,8 @@ static int __init memory_tier_init(void)
* For now we can have 4 faster memory tiers with smaller adistance
* than default DRAM tier.
*/
- default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+ default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
+ &default_memory_types);
if (IS_ERR(default_dram_type))
panic("%s() failed to allocate default DRAM tier\n", __func__);
@@ -836,6 +903,14 @@ static int __init memory_tier_init(void)
* types assigned.
*/
for_each_node_state(node, N_MEMORY) {
+ if (!node_state(node, N_CPU))
+ /*
+ * Defer memory tier initialization on
+ * CPUless numa nodes. These will be initialized
+ * after firmware and devices are initialized.
+ */
+ continue;
+
memtier = set_node_memory_tier(node);
if (IS_ERR(memtier))
/*
diff --git a/mm/memory.c b/mm/memory.c
index 59c05dc8b18a7a..eea6e4984eaefa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -112,8 +112,10 @@ static bool vmf_pte_changed(struct vm_fault *vmf);
* Return true if the original pte was a uffd-wp pte marker (so the pte was
* wr-protected).
*/
-static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
+ if (!userfaultfd_wp(vmf->vma))
+ return false;
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return false;
@@ -989,7 +991,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
flags |= FPB_IGNORE_SOFT_DIRTY;
nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable, NULL);
+ &any_writable, NULL, NULL);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1502,8 +1504,7 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
if (!delay_rmap) {
folio_remove_rmap_ptes(folio, page, nr, vma);
- /* Only sanity-check the first page in a batch. */
- if (unlikely(page_mapcount(page) < 0))
+ if (unlikely(folio_mapcount(folio) < 0))
print_bad_pte(vma, addr, ptent, page);
}
@@ -1559,7 +1560,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL, NULL);
+ NULL, NULL, NULL);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
@@ -3213,19 +3214,39 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
return VM_FAULT_RETRY;
}
+/**
+ * vmf_anon_prepare - Prepare to handle an anonymous fault.
+ * @vmf: The vm_fault descriptor passed from the fault handler.
+ *
+ * When preparing to insert an anonymous page into a VMA from a
+ * fault handler, call this function rather than anon_vma_prepare().
+ * If this vma does not already have an associated anon_vma and we are
+ * only protected by the per-VMA lock, the caller must retry with the
+ * mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to
+ * determine if this VMA can share its anon_vma, and that's not safe to
+ * do with only the per-VMA lock held for this VMA.
+ *
+ * Return: 0 if fault handling can proceed. Any other value should be
+ * returned to the caller.
+ */
vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
if (likely(vma->anon_vma))
return 0;
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
+ if (!mmap_read_trylock(vma->vm_mm)) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
}
if (__anon_vma_prepare(vma))
- return VM_FAULT_OOM;
- return 0;
+ ret = VM_FAULT_OOM;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ mmap_read_unlock(vma->vm_mm);
+ return ret;
}
/*
@@ -4333,8 +4354,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
- BIT(PMD_ORDER) - 1);
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
if (!orders)
@@ -4369,6 +4390,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
folio = vma_alloc_folio(gfp, order, vma, addr, true);
if (folio) {
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
folio_put(folio);
goto next;
}
@@ -4377,6 +4399,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
return folio;
}
next:
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
order = next_order(&orders, order);
}
@@ -4392,7 +4415,6 @@ fallback:
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
- bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
struct folio *folio;
@@ -4437,8 +4459,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
}
/* Allocate our own private page. */
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
+ ret = vmf_anon_prepare(vmf);
+ if (ret)
+ return ret;
/* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
folio = alloc_anon_folio(vmf);
if (IS_ERR(folio))
@@ -4486,10 +4509,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
folio_ref_add(folio, nr_pages - 1);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
+#endif
folio_add_new_anon_rmap(folio, vma, addr);
folio_add_lru_vma(folio, vma);
setpte:
- if (uffd_wp)
+ if (vmf_orig_pte_uffd_wp(vmf))
entry = pte_mkuffd_wp(entry);
set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
@@ -4664,7 +4690,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
- bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
pte_t entry;
@@ -4679,16 +4704,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (unlikely(uffd_wp))
+ if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
VM_BUG_ON_FOLIO(nr != 1, folio);
folio_add_new_anon_rmap(folio, vma, addr);
folio_add_lru_vma(folio, vma);
} else {
- add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
folio_add_file_rmap_ptes(folio, page, nr, vma);
}
set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4725,9 +4748,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page;
vm_fault_t ret;
+ bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vma->vm_flags & VM_SHARED);
/* Did we COW the page? */
- if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if (is_cow)
page = vmf->cow_page;
else
page = vmf->page;
@@ -4763,8 +4788,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
struct folio *folio = page_folio(page);
+ int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
set_pte_range(vmf, folio, page, 1, vmf->address);
+ add_mm_counter(vma->vm_mm, type, 1);
ret = 0;
} else {
update_mmu_tlb(vma, vmf->address, vmf->pte);
@@ -5432,7 +5459,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5466,7 +5494,8 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5820,15 +5849,6 @@ retry:
if (!vma_start_read(vma))
goto inval;
- /*
- * find_mergeable_anon_vma uses adjacent vmas which are not locked.
- * This check must happen after vma_start_read(); otherwise, a
- * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA
- * from its anon_vma.
- */
- if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
- goto inval_end_read;
-
/* Check since vm_start/vm_end might change before we lock the VMA */
if (unlikely(address < vma->vm_start || address >= vma->vm_end))
goto inval_end_read;
@@ -5926,34 +5946,48 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
/**
* follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
+ * @vma: the memory mapping
* @address: user virtual address
* @ptepp: location to store found PTE
* @ptlp: location to store the lock for the PTE
*
* On a successful return, the pointer to the PTE is stored in @ptepp;
* the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of the PTE are only stable until @ptlp is released;
- * any further use, if any, must be protected against invalidation
- * with MMU notifiers.
+ *
+ * The contents of the PTE are only stable until @ptlp is released using
+ * pte_unmap_unlock(). This function will fail if the PTE is non-present.
+ * Present PTEs may include PTEs that map refcounted pages, such as
+ * anonymous folios in COW mappings.
+ *
+ * Callers must be careful when relying on PTE content after
+ * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
+ * callers must protect against invalidation with MMU notifiers; otherwise
+ * access to the PFN at a later point in time can trigger use-after-free.
*
* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
* should be taken for read.
*
- * KVM uses this function. While it is arguably less bad than the historic
- * ``follow_pfn``, it is not a good general-purpose API.
+ * This function must not be used to modify PTE content.
*
* Return: zero on success, -ve otherwise.
*/
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
+ mmap_assert_locked(mm);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto out;
+
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ goto out;
+
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
@@ -6007,11 +6041,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
int offset = offset_in_page(addr);
int ret = -EINVAL;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
- return -EINVAL;
-
retry:
- if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ if (follow_pte(vma, addr, &ptep, &ptl))
return -EINVAL;
pte = ptep_get(ptep);
pte_unmap_unlock(ptep, ptl);
@@ -6026,7 +6057,7 @@ retry:
if (!maddr)
return -ENOMEM;
- if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ if (follow_pte(vma, addr, &ptep, &ptl))
goto out_unmap;
if (!pte_same(pte, ptep_get(ptep))) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b79ba36e09e034..431b1f6753c0b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2051,11 +2051,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
}
/*
- * Dissolve free hugepages in the memory block before doing
+ * Dissolve free hugetlb folios in the memory block before doing
* offlining actually in order to make hugetlbfs's object
* counting consistent.
*/
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+ ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
if (ret) {
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
diff --git a/mm/memremap.c b/mm/memremap.c
index e1776693e2eaed..40d4547ce5144e 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -512,9 +512,9 @@ void zone_device_page_init(struct page *page)
EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page_refs(struct page *page, int refs)
+bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
- if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX)
return false;
/*
@@ -522,9 +522,9 @@ bool __put_devmap_managed_page_refs(struct page *page, int refs)
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
- if (page_ref_sub_return(page, refs) == 1)
- wake_up_var(&page->_refcount);
+ if (folio_ref_sub_return(folio, refs) == 1)
+ wake_up_var(&folio->_refcount);
return true;
}
-EXPORT_SYMBOL(__put_devmap_managed_page_refs);
+EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
#endif /* CONFIG_FS_DAX */
diff --git a/mm/migrate.c b/mm/migrate.c
index 285072bca29ce2..dd04f578c19c3e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -616,7 +616,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_migrate_ksm(newfolio, folio);
/*
* Please do not reorder this without considering how mm/ksm.c's
- * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+ * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache().
*/
if (folio_test_swapcache(folio))
folio_clear_swapcache(folio);
@@ -1425,7 +1425,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
* semaphore in write mode here and set TTU_RMAP_LOCKED
* to let lower levels know we have taken the lock.
*/
- mapping = hugetlb_page_mapping_lock_write(&src->page);
+ mapping = hugetlb_folio_mapping_lock_write(src);
if (unlikely(!mapping))
goto unlock_put_anon;
@@ -2140,7 +2140,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
goto out_putfolio;
err = -EACCES;
- if (page_mapcount(page) > 1 && !migrate_all)
+ if (folio_likely_mapped_shared(folio) && !migrate_all)
goto out_putfolio;
err = -EBUSY;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index d40b46ae9d65b7..7e0712493d1f48 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -324,6 +324,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
*/
static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
+ struct folio *folio = page_folio(page);
+
/*
* One extra ref because caller holds an extra reference, either from
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
@@ -336,18 +338,18 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
* check them than regular pages, because they can be mapped with a pmd
* or with a pte (split pte mapping).
*/
- if (PageCompound(page))
+ if (folio_test_large(folio))
return false;
/* Page from ZONE_DEVICE have one extra reference */
- if (is_zone_device_page(page))
+ if (folio_is_zone_device(folio))
extra++;
/* For file back page */
- if (page_mapping(page))
- extra += 1 + page_has_private(page);
+ if (folio_mapping(folio))
+ extra += 1 + folio_has_private(folio);
- if ((page_count(page) - extra) > page_mapcount(page))
+ if ((folio_ref_count(folio) - extra) > folio_mapcount(folio))
return false;
return true;
@@ -694,6 +696,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
+ struct folio *folio;
int r;
if (!newpage) {
@@ -728,15 +731,12 @@ static void __migrate_device_pages(unsigned long *src_pfns,
continue;
}
- mapping = page_mapping(page);
+ folio = page_folio(page);
+ mapping = folio_mapping(folio);
if (is_device_private_page(newpage) ||
is_device_coherent_page(newpage)) {
if (mapping) {
- struct folio *folio;
-
- folio = page_folio(page);
-
/*
* For now only support anonymous memory migrating to
* device private or coherent memory.
@@ -759,11 +759,10 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (migrate && migrate->fault_page == page)
r = migrate_folio_extra(mapping, page_folio(newpage),
- page_folio(page),
- MIGRATE_SYNC_NO_COPY, 1);
+ folio, MIGRATE_SYNC_NO_COPY, 1);
else
r = migrate_folio(mapping, page_folio(newpage),
- page_folio(page), MIGRATE_SYNC_NO_COPY);
+ folio, MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index d3c2ca8efa534c..057270dbe3aa12 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1255,6 +1255,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ /*
+ * addr is returned from get_unmapped_area,
+ * There are two cases:
+ * 1> MAP_FIXED == false
+ * unallocated memory, no need to check sealing.
+ * 1> MAP_FIXED == true
+ * sealing is checked inside mmap_region when
+ * do_vmi_munmap is called.
+ */
+
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(mm);
if (pkey < 0)
@@ -1514,32 +1524,32 @@ bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
* to the private version (using protection_map[] without the
* VM_SHARED bit).
*/
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
/* If it was private or non-writable, the write bit is already clear */
if (!vma_is_shared_writable(vma))
- return 0;
+ return false;
/* The backer wishes to know when pages are first written to? */
if (vm_ops_needs_writenotify(vma->vm_ops))
- return 1;
+ return true;
/* The open routine did something to the protections that pgprot_modify
* won't preserve? */
if (pgprot_val(vm_page_prot) !=
pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
- return 0;
+ return false;
/*
* Do we need to track softdirty? hugetlb does not support softdirty
* tracking yet.
*/
if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
- return 1;
+ return true;
/* Do we need write faults for uffd-wp tracking? */
if (userfaultfd_wp(vma))
- return 1;
+ return true;
/* Can the mapping track the dirty pages? */
return vma_fs_can_writeback(vma);
@@ -1549,14 +1559,14 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
* We account for memory if it's a private writeable mapping,
* not hugepages and VM_NORESERVE wasn't set.
*/
-static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
+static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
/*
* hugetlb has its own accounting separate from the core VM
* VM_HUGETLB may not be set yet so we cannot check for that flag.
*/
if (file && is_file_hugepages(file))
- return 0;
+ return false;
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
@@ -2725,6 +2735,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
if (end == start)
return -EINVAL;
+ /*
+ * Check if memory is sealed before arch_unmap.
+ * Prevent unmapping a sealed VMA.
+ * can_modify_mm assumes we have acquired the lock on MM.
+ */
+ if (unlikely(!can_modify_mm(mm, start, end)))
+ return -EPERM;
+
/* arch_unmap() might do unmaps itself. */
arch_unmap(mm, start, end);
@@ -2787,7 +2805,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
/* Unmap any existing mapping in the area */
- if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
+ error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
+ if (error == -EPERM)
+ return error;
+ else if (error)
return -ENOMEM;
/*
@@ -3137,6 +3158,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
+ /*
+ * Check if memory is sealed before arch_unmap.
+ * Prevent unmapping a sealed VMA.
+ * can_modify_mm assumes we have acquired the lock on MM.
+ */
+ if (unlikely(!can_modify_mm(mm, start, end)))
+ return -EPERM;
+
arch_unmap(mm, start, end);
return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94878c39ee32dd..8c6cd88252738d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -32,6 +32,7 @@
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
#include <linux/memory-tiers.h>
+#include <uapi/linux/mman.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -744,6 +745,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
}
+ /*
+ * checking if memory is sealed.
+ * can_modify_mm assumes we have acquired the lock on MM.
+ */
+ if (unlikely(!can_modify_mm(current->mm, start, end))) {
+ error = -EPERM;
+ goto out;
+ }
+
prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index f5aba752d35f76..5f96bc5ee91827 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
return -ENOMEM;
+ /*
+ * In mremap_to().
+ * Move a VMA to another location, check if src addr is sealed.
+ *
+ * Place can_modify_mm here because mremap_to()
+ * does its own checking for address range, and we only
+ * check the sealing after passing those checks.
+ *
+ * can_modify_mm assumes we have acquired the lock on MM.
+ */
+ if (unlikely(!can_modify_mm(mm, addr, addr + old_len)))
+ return -EPERM;
+
if (flags & MREMAP_FIXED) {
+ /*
+ * In mremap_to().
+ * VMA is moved to dst address, and munmap dst first.
+ * do_munmap will check if dst is sealed.
+ */
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
@@ -1062,6 +1080,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
/*
+ * Below is shrink/expand case (not mremap_to())
+ * Check if src address is sealed, if so, reject.
+ * In other words, prevent shrinking or expanding a sealed VMA.
+ *
+ * Place can_modify_mm here so we can keep the logic related to
+ * shrink/expand together.
+ */
+ if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
* do_vmi_munmap does all the needed commit accounting, and
diff --git a/mm/mseal.c b/mm/mseal.c
new file mode 100644
index 00000000000000..bf783bba8ed0b9
--- /dev/null
+++ b/mm/mseal.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implement mseal() syscall.
+ *
+ * Copyright (c) 2023,2024 Google, Inc.
+ *
+ * Author: Jeff Xu <jeffxu@chromium.org>
+ */
+
+#include <linux/mempolicy.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & VM_SEALED);
+}
+
+static inline void set_vma_sealed(struct vm_area_struct *vma)
+{
+ vm_flags_set(vma, VM_SEALED);
+}
+
+/*
+ * check if a vma is sealed for modification.
+ * return true, if modification is allowed.
+ */
+static bool can_modify_vma(struct vm_area_struct *vma)
+{
+ if (unlikely(vma_is_sealed(vma)))
+ return false;
+
+ return true;
+}
+
+static bool is_madv_discard(int behavior)
+{
+ return behavior &
+ (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED |
+ MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK);
+}
+
+static bool is_ro_anon(struct vm_area_struct *vma)
+{
+ /* check anonymous mapping. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ return false;
+
+ /*
+ * check for non-writable:
+ * PROT=RO or PKRU is not writeable.
+ */
+ if (!(vma->vm_flags & VM_WRITE) ||
+ !arch_vma_access_permitted(vma, true, false, false))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if the vmas of a memory range are allowed to be modified.
+ * the memory ranger can have a gap (unallocated memory).
+ * return true, if it is allowed.
+ */
+bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *vma;
+
+ VMA_ITERATOR(vmi, mm, start);
+
+ /* going through each vma to check. */
+ for_each_vma_range(vmi, vma, end) {
+ if (unlikely(!can_modify_vma(vma)))
+ return false;
+ }
+
+ /* Allow by default. */
+ return true;
+}
+
+/*
+ * Check if the vmas of a memory range are allowed to be modified by madvise.
+ * the memory ranger can have a gap (unallocated memory).
+ * return true, if it is allowed.
+ */
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
+ int behavior)
+{
+ struct vm_area_struct *vma;
+
+ VMA_ITERATOR(vmi, mm, start);
+
+ if (!is_madv_discard(behavior))
+ return true;
+
+ /* going through each vma to check. */
+ for_each_vma_range(vmi, vma, end)
+ if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
+ return false;
+
+ /* Allow by default. */
+ return true;
+}
+
+static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, vm_flags_t newflags)
+{
+ int ret = 0;
+ vm_flags_t oldflags = vma->vm_flags;
+
+ if (newflags == oldflags)
+ goto out;
+
+ vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
+
+ set_vma_sealed(vma);
+out:
+ *prev = vma;
+ return ret;
+}
+
+/*
+ * Check for do_mseal:
+ * 1> start is part of a valid vma.
+ * 2> end is part of a valid vma.
+ * 3> No gap (unallocated address) between start and end.
+ * 4> map is sealable.
+ */
+static int check_mm_seal(unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *vma;
+ unsigned long nstart = start;
+
+ VMA_ITERATOR(vmi, current->mm, start);
+
+ /* going through each vma to check. */
+ for_each_vma_range(vmi, vma, end) {
+ if (vma->vm_start > nstart)
+ /* unallocated memory found. */
+ return -ENOMEM;
+
+ if (vma->vm_end >= end)
+ return 0;
+
+ nstart = vma->vm_end;
+ }
+
+ return -ENOMEM;
+}
+
+/*
+ * Apply sealing.
+ */
+static int apply_mm_seal(unsigned long start, unsigned long end)
+{
+ unsigned long nstart;
+ struct vm_area_struct *vma, *prev;
+
+ VMA_ITERATOR(vmi, current->mm, start);
+
+ vma = vma_iter_load(&vmi);
+ /*
+ * Note: check_mm_seal should already checked ENOMEM case.
+ * so vma should not be null, same for the other ENOMEM cases.
+ */
+ prev = vma_prev(&vmi);
+ if (start > vma->vm_start)
+ prev = vma;
+
+ nstart = start;
+ for_each_vma_range(vmi, vma, end) {
+ int error;
+ unsigned long tmp;
+ vm_flags_t newflags;
+
+ newflags = vma->vm_flags | VM_SEALED;
+ tmp = vma->vm_end;
+ if (tmp > end)
+ tmp = end;
+ error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
+ if (error)
+ return error;
+ nstart = vma_iter_end(&vmi);
+ }
+
+ return 0;
+}
+
+/*
+ * mseal(2) seals the VM's meta data from
+ * selected syscalls.
+ *
+ * addr/len: VM address range.
+ *
+ * The address range by addr/len must meet:
+ * start (addr) must be in a valid VMA.
+ * end (addr + len) must be in a valid VMA.
+ * no gap (unallocated memory) between start and end.
+ * start (addr) must be page aligned.
+ *
+ * len: len will be page aligned implicitly.
+ *
+ * Below VMA operations are blocked after sealing.
+ * 1> Unmapping, moving to another location, and shrinking
+ * the size, via munmap() and mremap(), can leave an empty
+ * space, therefore can be replaced with a VMA with a new
+ * set of attributes.
+ * 2> Moving or expanding a different vma into the current location,
+ * via mremap().
+ * 3> Modifying a VMA via mmap(MAP_FIXED).
+ * 4> Size expansion, via mremap(), does not appear to pose any
+ * specific risks to sealed VMAs. It is included anyway because
+ * the use case is unclear. In any case, users can rely on
+ * merging to expand a sealed VMA.
+ * 5> mprotect and pkey_mprotect.
+ * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
+ * for anonymous memory, when users don't have write permission to the
+ * memory. Those behaviors can alter region contents by discarding pages,
+ * effectively a memset(0) for anonymous memory.
+ *
+ * flags: reserved.
+ *
+ * return values:
+ * zero: success.
+ * -EINVAL:
+ * invalid input flags.
+ * start address is not page aligned.
+ * Address arange (start + len) overflow.
+ * -ENOMEM:
+ * addr is not a valid address (not allocated).
+ * end (start + len) is not a valid address.
+ * a gap (unallocated memory) between start and end.
+ * -EPERM:
+ * - In 32 bit architecture, sealing is not supported.
+ * Note:
+ * user can call mseal(2) multiple times, adding a seal on an
+ * already sealed memory is a no-action (no error).
+ *
+ * unseal() is not supported.
+ */
+static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
+{
+ size_t len;
+ int ret = 0;
+ unsigned long end;
+ struct mm_struct *mm = current->mm;
+
+ ret = can_do_mseal(flags);
+ if (ret)
+ return ret;
+
+ start = untagged_addr(start);
+ if (!PAGE_ALIGNED(start))
+ return -EINVAL;
+
+ len = PAGE_ALIGN(len_in);
+ /* Check to see whether len was rounded up from small -ve to zero. */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ /*
+ * First pass, this helps to avoid
+ * partial sealing in case of error in input address range,
+ * e.g. ENOMEM error.
+ */
+ ret = check_mm_seal(start, end);
+ if (ret)
+ goto out;
+
+ /*
+ * Second pass, this should success, unless there are errors
+ * from vma_modify_flags, e.g. merge/split error, or process
+ * reaching the max supported VMAs, however, those cases shall
+ * be rare.
+ */
+ ret = apply_mm_seal(start, end);
+
+out:
+ mmap_write_unlock(current->mm);
+ return ret;
+}
+
+SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
+ flags)
+{
+ return do_mseal(start, len, flags);
+}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fba324e1a010d2..692c0da04cbd03 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -838,13 +838,15 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
}
/**
- * __wb_calc_thresh - @wb's share of dirty throttling threshold
+ * __wb_calc_thresh - @wb's share of dirty threshold
* @dtc: dirty_throttle_context of interest
+ * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc
*
- * Note that balance_dirty_pages() will only seriously take it as a hard limit
- * when sleeping max_pause per page is not enough to keep the dirty pages under
- * control. For example, when the device is completely stalled due to some error
- * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
+ * Note that balance_dirty_pages() will only seriously take dirty throttling
+ * threshold as a hard limit when sleeping max_pause per page is not enough
+ * to keep the dirty pages under control. For example, when the device is
+ * completely stalled due to some error conditions, or when there are 1000
+ * dd tasks writing to a slow 10MB/s USB key.
* In the other normal situations, it acts more gently by throttling the tasks
* more (rather than completely block them) when the wb dirty pages go high.
*
@@ -855,19 +857,20 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
* The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*
- * Return: @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty and PG_writeback pages.
+ * Return: @wb's dirty limit in pages. For dirty throttling limit, the term
+ * "dirty" in the context of dirty balancing includes all PG_dirty and
+ * PG_writeback pages.
*/
-static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
+static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
+ unsigned long thresh)
{
struct wb_domain *dom = dtc_dom(dtc);
- unsigned long thresh = dtc->thresh;
u64 wb_thresh;
unsigned long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
/*
- * Calculate this BDI's share of the thresh ratio.
+ * Calculate this wb's share of the thresh ratio.
*/
fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
&numerator, &denominator);
@@ -887,9 +890,28 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
- struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
- .thresh = thresh };
- return __wb_calc_thresh(&gdtc);
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+
+ return __wb_calc_thresh(&gdtc, thresh);
+}
+
+unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+ struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
+ unsigned long filepages = 0, headroom = 0, writeback = 0;
+
+ gdtc.avail = global_dirtyable_memory();
+ gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_WRITEBACK);
+
+ mem_cgroup_wb_stats(wb, &filepages, &headroom,
+ &mdtc.dirty, &writeback);
+ mdtc.dirty += writeback;
+ mdtc_calc_avail(&mdtc, filepages, headroom);
+ domain_dirty_limits(&mdtc);
+
+ return __wb_calc_thresh(&mdtc, mdtc.thresh);
}
/*
@@ -1636,7 +1658,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* wb_position_ratio() will let the dirtier task progress
* at some rate <= (write_bw / 2) for bringing down wb_dirty.
*/
- dtc->wb_thresh = __wb_calc_thresh(dtc);
+ dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh);
dtc->wb_bg_thresh = dtc->thresh ?
div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
@@ -1675,7 +1697,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty */
+ unsigned long nr_dirty;
long period;
long pause;
long max_pause;
@@ -1696,9 +1718,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
+ nr_dirty = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+ gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK);
domain_dirty_limits(gdtc);
@@ -1749,7 +1771,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
- if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
+ if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
!writeback_in_progress(wb))
wb_start_background_writeback(wb);
@@ -2095,7 +2117,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (gdtc->dirty > gdtc->bg_thresh)
return true;
- thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
+ thresh = __wb_calc_thresh(gdtc, gdtc->bg_thresh);
if (thresh < 2 * wb_stat_error())
reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
else
@@ -2115,7 +2137,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (mdtc->dirty > mdtc->bg_thresh)
return true;
- thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
+ thresh = __wb_calc_thresh(mdtc, mdtc->bg_thresh);
if (thresh < 2 * wb_stat_error())
reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
else
@@ -2699,17 +2721,20 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
}
/*
- * Mark the folio dirty, and set it dirty in the page cache, and mark
- * the inode dirty.
+ * Mark the folio dirty, and set it dirty in the page cache.
*
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold folio_memcg_lock(). Most callers have the folio
- * locked. A few have the folio blocked from truncation through other
- * means (eg zap_vma_pages() has it mapped and is holding the page table
- * lock). This can also be called from mark_buffer_dirty(), which I
- * cannot prove is always protected against truncate.
+ * The caller must hold folio_memcg_lock(). It is the caller's
+ * responsibility to prevent the folio from being truncated while
+ * this function is in progress, although it may have been truncated
+ * before this function is called. Most callers have the folio locked.
+ * A few have the folio blocked from truncation through other means (e.g.
+ * zap_vma_pages() has it mapped and is holding the page table lock).
+ * When called from mark_buffer_dirty(), the filesystem should hold a
+ * reference to the buffer_head that is being marked dirty, which causes
+ * try_to_free_buffers() to fail.
*/
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
int warn)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22e8b9f1d710b9..cd584aace6bfde 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -595,12 +595,14 @@ compaction_capture(struct capture_control *capc, struct page *page,
return false;
/*
- * Do not let lower order allocations pollute a movable pageblock.
+ * Do not let lower order allocations pollute a movable pageblock
+ * unless compaction is also requesting movable pages.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
*/
- if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE &&
+ capc->cc->migratetype != MIGRATE_MOVABLE)
return false;
capc->page = page;
@@ -935,6 +937,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
bad_page(page, "nonzero entire_mapcount");
goto out;
}
+ if (unlikely(folio_large_mapcount(folio))) {
+ bad_page(page, "nonzero large_mapcount");
+ goto out;
+ }
if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
bad_page(page, "nonzero nr_pages_mapped");
goto out;
@@ -2161,6 +2167,43 @@ do_steal:
return page;
}
+#ifdef CONFIG_CMA
+/*
+ * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via
+ * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok
+ * again without ALLOC_CMA to see if to use CMA first.
+ */
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+ unsigned long watermark;
+ bool cma_first = false;
+
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+ /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */
+ if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) {
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2);
+ } else {
+ /*
+ * watermark failed means UNMOVABLE & RECLAIMBLE is not enough
+ * now, we should use cma first to keep them stay around the
+ * corresponding watermark
+ */
+ cma_first = true;
+ }
+ return cma_first;
+}
+#else
+static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags)
+{
+ return false;
+}
+#endif
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
@@ -2174,12 +2217,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
if (IS_ENABLED(CONFIG_CMA)) {
/*
* Balance movable allocations between regular and CMA areas by
- * allocating from CMA when over half of the zone's free memory
- * is in the CMA area.
+ * allocating from CMA base on judging zone_watermark_ok again
+ * to see if the latest check got pass via the help of CMA
*/
if (alloc_flags & ALLOC_CMA &&
- zone_page_state(zone, NR_FREE_CMA_PAGES) >
- zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ use_cma_first(zone, order, alloc_flags)) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
return page;
@@ -6343,8 +6385,12 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
total_reclaimed += nr_reclaimed;
- list_for_each_entry(page, &cc->migratepages, lru)
- total_mapped += page_mapcount(page);
+ list_for_each_entry(page, &cc->migratepages, lru) {
+ struct folio *folio = page_folio(page);
+
+ total_mapped += folio_mapped(folio) *
+ folio_nr_pages(folio);
+ }
}
ret = migrate_pages(&cc->migratepages, alloc_migration_target,
diff --git a/mm/page_io.c b/mm/page_io.c
index a9a7c236aeccfb..46c603dddf043d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -217,6 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
count_memcg_folio_events(folio, THP_SWPOUT, 1);
count_vm_event(THP_SWPOUT);
}
+ count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT);
#endif
count_vm_events(PSWPOUT, folio_nr_pages(folio));
}
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index af69c3c8f7c2d5..4169576bed7298 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -7,6 +7,8 @@
#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#undef pr_fmt
#define pr_fmt(fmt) "page_table_check: " fmt
@@ -182,6 +184,22 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
}
EXPORT_SYMBOL(__page_table_check_pud_clear);
+/* Whether the swap entry cached writable information */
+static inline bool swap_cached_writable(swp_entry_t entry)
+{
+ return is_writable_device_exclusive_entry(entry) ||
+ is_writable_device_private_entry(entry) ||
+ is_writable_migration_entry(entry);
+}
+
+static inline void page_table_check_pte_flags(pte_t pte)
+{
+ if (pte_present(pte) && pte_uffd_wp(pte))
+ WARN_ON_ONCE(pte_write(pte));
+ else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte))
+ WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte)));
+}
+
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
unsigned int nr)
{
@@ -190,6 +208,8 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
if (&init_mm == mm)
return;
+ page_table_check_pte_flags(pte);
+
for (i = 0; i < nr; i++)
__page_table_check_pte_clear(mm, ptep_get(ptep + i));
if (pte_user_accessible_page(pte))
@@ -197,11 +217,21 @@ void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
}
EXPORT_SYMBOL(__page_table_check_ptes_set);
+static inline void page_table_check_pmd_flags(pmd_t pmd)
+{
+ if (pmd_present(pmd) && pmd_uffd_wp(pmd))
+ WARN_ON_ONCE(pmd_write(pmd));
+ else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd))
+ WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
+}
+
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
{
if (&init_mm == mm)
return;
+ page_table_check_pmd_flags(pmd);
+
__page_table_check_pmd_clear(mm, *pmdp);
if (pmd_user_accessible_page(pmd)) {
page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 53b8868ede61c6..ae5cc42aa2087a 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -314,16 +314,18 @@ next_pte:
return false;
}
+#ifdef CONFIG_MEMORY_FAILURE
/**
* page_mapped_in_vma - check whether a page is really mapped in a VMA
* @page: the page to test
* @vma: the VMA to test
*
- * Returns 1 if the page is mapped into the page tables of the VMA, 0
- * if the page is not mapped into the page tables of this VMA. Only
- * valid for normal file or anonymous VMAs.
+ * Return: The address the page is mapped at if the page is in the range
+ * covered by the VMA and present in the page table. If the page is
+ * outside the VMA or not present, returns -EFAULT.
+ * Only valid for normal file or anonymous VMAs.
*/
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
struct folio *folio = page_folio(page);
pgoff_t pgoff = folio->index + folio_page_idx(folio, page);
@@ -336,9 +338,11 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
pvmw.address = vma_address(vma, pgoff, 1);
if (pvmw.address == -EFAULT)
- return 0;
+ goto out;
if (!page_vma_mapped_walk(&pvmw))
- return 0;
+ return -EFAULT;
page_vma_mapped_walk_done(&pvmw);
- return 1;
+out:
+ return pvmw.address;
}
+#endif
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d02..74e34ea90656a5 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 56b313aa2ebfb2..7faa60bc3e4db6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* for the new allocation. At the same time, we do not want
* to do any locking for the common case of already having
* an anon_vma.
- *
- * This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
+ mmap_assert_locked(mm);
might_sleep();
avc = anon_vma_chain_alloc(GFP_KERNEL);
@@ -1138,50 +1137,32 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
return page_vma_mkclean_one(&pvmw);
}
-int folio_total_mapcount(const struct folio *folio)
-{
- int mapcount = folio_entire_mapcount(folio);
- int nr_pages;
- int i;
-
- /* In the common case, avoid the loop when no pages mapped by PTE */
- if (folio_nr_pages_mapped(folio) == 0)
- return mapcount;
- /*
- * Add all the PTE mappings of those pages mapped by PTE.
- * Limit the loop to folio_nr_pages_mapped()?
- * Perhaps: given all the raciness, that may be a good or a bad idea.
- */
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
-
- /* But each of those _mapcounts was based on -1 */
- mapcount += nr_pages;
- return mapcount;
-}
-
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
struct page *page, int nr_pages, enum rmap_level level,
int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
+ const int orig_nr_pages = nr_pages;
int first, nr = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
+ if (!folio_test_large(folio)) {
+ nr = atomic_inc_and_test(&page->_mapcount);
+ break;
+ }
+
do {
first = atomic_inc_and_test(&page->_mapcount);
- if (first && folio_test_large(folio)) {
+ if (first) {
first = atomic_inc_return_relaxed(mapped);
- first = (first < ENTIRELY_MAPPED);
+ if (first < ENTIRELY_MAPPED)
+ nr++;
}
-
- if (first)
- nr++;
} while (page++, --nr_pages > 0);
+ atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
@@ -1198,6 +1179,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
+ atomic_inc(&folio->_large_mapcount);
break;
}
return nr;
@@ -1433,10 +1415,14 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
SetPageAnonExclusive(page);
}
+ /* increment count (starts at -1) */
+ atomic_set(&folio->_large_mapcount, nr - 1);
atomic_set(&folio->_nr_pages_mapped, nr);
} else {
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
+ /* increment count (starts at -1) */
+ atomic_set(&folio->_large_mapcount, 0);
atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
SetPageAnonExclusive(&folio->page);
__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
@@ -1508,24 +1494,32 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
{
atomic_t *mapped = &folio->_nr_pages_mapped;
int last, nr = 0, nr_pmdmapped = 0;
+ bool partially_mapped = false;
enum node_stat_item idx;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
+ if (!folio_test_large(folio)) {
+ nr = atomic_add_negative(-1, &page->_mapcount);
+ break;
+ }
+
+ atomic_sub(nr_pages, &folio->_large_mapcount);
do {
last = atomic_add_negative(-1, &page->_mapcount);
- if (last && folio_test_large(folio)) {
+ if (last) {
last = atomic_dec_return_relaxed(mapped);
- last = (last < ENTIRELY_MAPPED);
+ if (last < ENTIRELY_MAPPED)
+ nr++;
}
-
- if (last)
- nr++;
} while (page++, --nr_pages > 0);
+
+ partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
+ atomic_dec(&folio->_large_mapcount);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
@@ -1540,6 +1534,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = 0;
}
}
+
+ partially_mapped = nr < nr_pmdmapped;
break;
}
@@ -1561,9 +1557,10 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
* page of the folio is unmapped and at least one page
* is still mapped.
*/
- if (folio_test_large(folio) && folio_test_anon(folio))
- if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped)
- deferred_split_folio(folio);
+ if (folio_test_anon(folio) &&
+ list_empty(&folio->_deferred_list) &&
+ partially_mapped)
+ deferred_split_folio(folio);
}
/*
@@ -2708,6 +2705,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
if (flags & RMAP_EXCLUSIVE)
SetPageAnonExclusive(&folio->page);
VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
@@ -2722,6 +2720,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio,
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_large_mapcount, 0);
folio_clear_hugetlb_restore_reserve(folio);
__folio_set_anon(folio, vma, address, true);
SetPageAnonExclusive(&folio->page);
diff --git a/mm/slab.h b/mm/slab.h
index 411251b9bdd15c..d12fb4392e3539 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -581,8 +581,6 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
gfp_t flags, size_t size, void **p);
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
- enum node_stat_item idx, int nr);
#endif
size_t __ksize(const void *objp);
diff --git a/mm/sparse.c b/mm/sparse.c
index 46e88549d1a6a9..de40b2c7340673 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -560,6 +560,8 @@ void __init sparse_init(void)
unsigned long pnum_end, pnum_begin, map_count = 1;
int nid_begin;
+ /* see include/linux/mmzone.h 'struct mem_section' definition */
+ BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
memblocks_present();
pnum_begin = first_present_section_nr();
diff --git a/mm/swap.c b/mm/swap.c
index 8ae5cd4ed180bd..67786cb771305c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -447,15 +447,18 @@ static void folio_inc_refs(struct folio *folio)
}
#endif /* CONFIG_LRU_GEN */
-/*
- * Mark a page as having seen activity.
+/**
+ * folio_mark_accessed - Mark a folio as having seen activity.
+ * @folio: The folio to mark.
+ *
+ * This function will perform one of the following transitions:
*
- * inactive,unreferenced -> inactive,referenced
- * inactive,referenced -> active,unreferenced
- * active,unreferenced -> active,referenced
+ * * inactive,unreferenced -> inactive,referenced
+ * * inactive,referenced -> active,unreferenced
+ * * active,unreferenced -> active,referenced
*
- * When a newly allocated page is not yet visible, so safe for non-atomic ops,
- * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
+ * When a newly allocated folio is not yet visible, so safe for non-atomic ops,
+ * __folio_set_referenced() may be substituted for folio_mark_accessed().
*/
void folio_mark_accessed(struct folio *folio)
{
@@ -980,7 +983,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page_refs(&folio->page, nr_refs))
+ if (put_devmap_managed_folio_refs(folio, nr_refs))
continue;
if (folio_ref_sub_and_test(folio, nr_refs))
free_zone_device_folio(folio);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 148ef08f19dd66..96606580ee091d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1238,16 +1238,15 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
/*
* When we get a swap entry, if there aren't some other ways to
- * prevent swapoff, such as the folio in swap cache is locked, page
- * table lock is held, etc., the swap entry may become invalid because
- * of swapoff. Then, we need to enclose all swap related functions
- * with get_swap_device() and put_swap_device(), unless the swap
- * functions call get/put_swap_device() by themselves.
+ * prevent swapoff, such as the folio in swap cache is locked, RCU
+ * reader side is locked, etc., the swap entry may become invalid
+ * because of swapoff. Then, we need to enclose all swap related
+ * functions with get_swap_device() and put_swap_device(), unless the
+ * swap functions call get/put_swap_device() by themselves.
*
- * Note that when only holding the PTL, swapoff might succeed immediately
- * after freeing a swap entry. Therefore, immediately after
- * __swap_entry_free(), the swap info might become stale and should not
- * be touched without a prior get_swap_device().
+ * RCU reader side lock (including any spinlock) is sufficient to
+ * prevent swapoff, because synchronize_rcu() is called in swapoff()
+ * before freeing data structures.
*
* Check whether swap entry is valid in the swap device. If so,
* return pointer to swap_info_struct, and keep the swap entry valid
@@ -2445,13 +2444,17 @@ static void reinsert_swap_info(struct swap_info_struct *p)
spin_unlock(&swap_lock);
}
+static bool __has_usable_swap(void)
+{
+ return !plist_head_empty(&swap_active_head);
+}
+
bool has_usable_swap(void)
{
- bool ret = true;
+ bool ret;
spin_lock(&swap_lock);
- if (plist_head_empty(&swap_active_head))
- ret = false;
+ ret = __has_usable_swap();
spin_unlock(&swap_lock);
return ret;
}
@@ -2544,10 +2547,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
/*
* Wait for swap operations protected by get/put_swap_device()
- * to complete.
- *
- * We need synchronize_rcu() here to protect the accessing to
- * the swap cache data structure.
+ * to complete. Because of synchronize_rcu() here, all swap
+ * operations protected by RCU reader side lock (including any
+ * spinlock) will be waited too. This makes it easy to
+ * prevent folio_test_swapcache() and the following swap cache
+ * operations from racing with swapoff.
*/
percpu_ref_kill(&p->users);
synchronize_rcu();
@@ -3710,6 +3714,9 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
if (!(gfp & __GFP_IO))
return;
+ if (!__has_usable_swap())
+ return;
+
if (!blk_cgroup_congested())
return;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b70618e8dcd248..d9e82ae68244e6 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -56,17 +56,16 @@ struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
#ifdef CONFIG_PER_VMA_LOCK
/*
- * lock_vma() - Lookup and lock vma corresponding to @address.
+ * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
* @mm: mm to search vma in.
* @address: address that the vma should contain.
*
- * Should be called without holding mmap_lock. vma should be unlocked after use
- * with unlock_vma().
+ * Should be called without holding mmap_lock.
*
* Return: A locked vma containing @address, -ENOENT if no vma is found, or
* -ENOMEM if anon_vma couldn't be allocated.
*/
-static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
unsigned long address)
{
struct vm_area_struct *vma;
@@ -74,9 +73,8 @@ static struct vm_area_struct *lock_vma(struct mm_struct *mm,
vma = lock_vma_under_rcu(mm, address);
if (vma) {
/*
- * lock_vma_under_rcu() only checks anon_vma for private
- * anonymous mappings. But we need to ensure it is assigned in
- * private file-backed vmas as well.
+ * We know we're going to need to use anon_vma, so check
+ * that early.
*/
if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
vma_end_read(vma);
@@ -107,7 +105,7 @@ static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
{
struct vm_area_struct *dst_vma;
- dst_vma = lock_vma(dst_mm, dst_start);
+ dst_vma = uffd_lock_vma(dst_mm, dst_start);
if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
return dst_vma;
@@ -180,9 +178,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
- bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
- struct folio *folio;
+ struct folio *folio = page_folio(page);
+ bool page_in_cache = folio_mapping(folio);
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
@@ -212,7 +210,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (!pte_none_mostly(ptep_get(dst_pte)))
goto out_unlock;
- folio = page_folio(page);
if (page_in_cache) {
/* Usually, cache pages are already added to LRU */
if (newly_allocated)
@@ -1026,7 +1023,7 @@ static int move_present_pte(struct mm_struct *mm,
}
folio_move_anon_rmap(src_folio, dst_vma);
- WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
@@ -1402,7 +1399,7 @@ static int uffd_move_lock(struct mm_struct *mm,
struct vm_area_struct *vma;
int err;
- vma = lock_vma(mm, dst_start);
+ vma = uffd_lock_vma(mm, dst_start);
if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -1417,7 +1414,7 @@ static int uffd_move_lock(struct mm_struct *mm,
}
/*
- * Using lock_vma() to get src_vma can lead to following deadlock:
+ * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
*
* Thread1 Thread2
* ------- -------
@@ -1439,7 +1436,7 @@ static int uffd_move_lock(struct mm_struct *mm,
err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
if (!err) {
/*
- * See comment in lock_vma() as to why not using
+ * See comment in uffd_lock_vma() as to why not using
* vma_start_read() here.
*/
down_read(&(*dst_vmap)->vm_lock->lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a9fedb16ec17eb..6641be0ca80b4d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -42,6 +42,7 @@
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#include <linux/page_owner.h>
#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>
@@ -96,6 +97,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
{
pte_t *pte;
u64 pfn;
+ struct page *page;
unsigned long size = PAGE_SIZE;
pfn = phys_addr >> PAGE_SHIFT;
@@ -103,7 +105,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
do {
- BUG_ON(!pte_none(ptep_get(pte)));
+ if (!pte_none(ptep_get(pte))) {
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ dump_page(page, "remapping already mapped page");
+ }
+ BUG();
+ }
#ifdef CONFIG_HUGETLB_PAGE
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index da93213af981e9..49bd944239613d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -92,6 +92,11 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
+#ifdef CONFIG_MEMCG
+ /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
+ int *proactive_swappiness;
+#endif
+
/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
@@ -189,7 +194,7 @@ struct scan_control {
#endif
/*
- * From 0 .. 200. Higher means more swappy.
+ * From 0 .. MAX_SWAPPINESS. Higher means more swappy.
*/
int vm_swappiness = 60;
@@ -233,6 +238,13 @@ static bool writeback_throttling_sane(struct scan_control *sc)
#endif
return false;
}
+
+static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
+{
+ if (sc->proactive && sc->proactive_swappiness)
+ return *sc->proactive_swappiness;
+ return mem_cgroup_swappiness(memcg);
+}
#else
static bool cgroup_reclaim(struct scan_control *sc)
{
@@ -248,6 +260,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
}
+
+static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
+{
+ return READ_ONCE(vm_swappiness);
+}
#endif
static void set_task_reclaim_state(struct task_struct *task,
@@ -1214,6 +1231,8 @@ retry:
goto activate_locked;
}
if (!add_to_swap(folio)) {
+ int __maybe_unused order = folio_order(folio);
+
if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
@@ -1225,6 +1244,7 @@ retry:
THP_SWPOUT_FALLBACK, 1);
count_vm_event(THP_SWPOUT_FALLBACK);
}
+ count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(folio))
goto activate_locked_split;
@@ -2352,7 +2372,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
- int swappiness = mem_cgroup_swappiness(memcg);
+ int swappiness = sc_swappiness(sc, memcg);
u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
@@ -2428,7 +2448,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
ap = swappiness * (total_cost + 1);
ap /= anon_cost + 1;
- fp = (200 - swappiness) * (total_cost + 1);
+ fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
fp /= file_cost + 1;
fraction[0] = ap;
@@ -2633,7 +2653,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
- return mem_cgroup_swappiness(memcg);
+ return sc_swappiness(sc, memcg);
}
static int get_nr_gens(struct lruvec *lruvec, int type)
@@ -4448,7 +4468,7 @@ static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx
{
int type, tier;
struct ctrl_pos sp, pv;
- int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+ int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
/*
* Compare the first tier of anon with that of file to determine which
@@ -4495,7 +4515,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
type = LRU_GEN_ANON;
else if (swappiness == 1)
type = LRU_GEN_FILE;
- else if (swappiness == 200)
+ else if (swappiness == MAX_SWAPPINESS)
type = LRU_GEN_ANON;
else if (!(sc->gfp_mask & __GFP_IO))
type = LRU_GEN_FILE;
@@ -5429,9 +5449,9 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
lruvec = get_lruvec(memcg, nid);
- if (swappiness < 0)
+ if (swappiness < MIN_SWAPPINESS)
swappiness = get_swappiness(lruvec, sc);
- else if (swappiness > 200)
+ else if (swappiness > MAX_SWAPPINESS)
goto done;
switch (cmd) {
@@ -6514,12 +6534,14 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options)
+ unsigned int reclaim_options,
+ int *swappiness)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+ .proactive_swappiness = swappiness,
.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
diff --git a/mm/zswap.c b/mm/zswap.c
index e6cf6282f5fd01..a50e2986cd2fab 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -123,19 +123,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
-/*
- * Enable/disable handling same-value filled pages (enabled by default).
- * If disabled every page is considered non-same-value filled.
- */
-static bool zswap_same_filled_pages_enabled = true;
-module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
- bool, 0644);
-
-/* Enable/disable handling non-same-value filled pages (enabled by default) */
-static bool zswap_non_same_filled_pages_enabled = true;
-module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
- bool, 0644);
-
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
@@ -517,6 +504,21 @@ unsigned long zswap_total_pages(void)
return total;
}
+static bool zswap_check_limits(void)
+{
+ unsigned long cur_pages = zswap_total_pages();
+ unsigned long max_pages = zswap_max_pages();
+
+ if (cur_pages >= max_pages) {
+ zswap_pool_limit_hit++;
+ zswap_pool_reached_full = true;
+ } else if (zswap_pool_reached_full &&
+ cur_pages <= zswap_accept_thr_pages()) {
+ zswap_pool_reached_full = false;
+ }
+ return zswap_pool_reached_full;
+}
+
/*********************************
* param callbacks
**********************************/
@@ -1368,26 +1370,32 @@ resched:
} while (zswap_total_pages() > thr);
}
-static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
+/*********************************
+* same-filled functions
+**********************************/
+static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value)
{
unsigned long *page;
unsigned long val;
unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
+ bool ret = false;
- page = (unsigned long *)ptr;
+ page = kmap_local_folio(folio, 0);
val = page[0];
if (val != page[last_pos])
- return 0;
+ goto out;
for (pos = 1; pos < last_pos; pos++) {
if (val != page[pos])
- return 0;
+ goto out;
}
*value = val;
-
- return 1;
+ ret = true;
+out:
+ kunmap_local(page);
+ return ret;
}
static void zswap_fill_page(void *ptr, unsigned long value)
@@ -1398,6 +1406,9 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
+/*********************************
+* main API
+**********************************/
bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
@@ -1406,7 +1417,7 @@ bool zswap_store(struct folio *folio)
struct zswap_entry *entry, *old;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
- unsigned long max_pages, cur_pages;
+ unsigned long value;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1429,22 +1440,8 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}
- /* Check global limits */
- cur_pages = zswap_total_pages();
- max_pages = zswap_max_pages();
-
- if (cur_pages >= max_pages) {
- zswap_pool_limit_hit++;
- zswap_pool_reached_full = true;
- goto shrink;
- }
-
- if (zswap_pool_reached_full) {
- if (cur_pages > zswap_accept_thr_pages())
- goto shrink;
- else
- zswap_pool_reached_full = false;
- }
+ if (zswap_check_limits())
+ goto reject;
/* allocate entry */
entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
@@ -1453,24 +1450,13 @@ bool zswap_store(struct folio *folio)
goto reject;
}
- if (zswap_same_filled_pages_enabled) {
- unsigned long value;
- u8 *src;
-
- src = kmap_local_folio(folio, 0);
- if (zswap_is_page_same_filled(src, &value)) {
- kunmap_local(src);
- entry->length = 0;
- entry->value = value;
- atomic_inc(&zswap_same_filled_pages);
- goto insert_entry;
- }
- kunmap_local(src);
+ if (zswap_is_folio_same_filled(folio, &value)) {
+ entry->length = 0;
+ entry->value = value;
+ atomic_inc(&zswap_same_filled_pages);
+ goto store_entry;
}
- if (!zswap_non_same_filled_pages_enabled)
- goto freepage;
-
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool)
@@ -1488,7 +1474,7 @@ bool zswap_store(struct folio *folio)
if (!zswap_compress(folio, entry))
goto put_pool;
-insert_entry:
+store_entry:
entry->swpentry = swp;
entry->objcg = objcg;
@@ -1547,6 +1533,8 @@ freepage:
zswap_entry_cache_free(entry);
reject:
obj_cgroup_put(objcg);
+ if (zswap_pool_reached_full)
+ queue_work(shrink_wq, &zswap_shrink_work);
check_old:
/*
* If the zswap store fails or zswap is disabled, we must invalidate the
@@ -1557,10 +1545,6 @@ check_old:
if (entry)
zswap_entry_free(entry);
return false;
-
-shrink:
- queue_work(shrink_wq, &zswap_shrink_work);
- goto reject;
}
bool zswap_load(struct folio *folio)
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c
index 0cf27483cb3613..74fe915b7ffe8c 100644
--- a/samples/kfifo/dma-example.c
+++ b/samples/kfifo/dma-example.c
@@ -6,8 +6,9 @@
*/
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/kfifo.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
/*
* This module shows how to handle fifo dma operations.
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index c5af566e911ae7..1d13cecc7cc780 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -37,11 +37,6 @@ else
KBUILD_CFLAGS += -Wno-main
endif
-# These warnings generated too much noise in a regular build.
-# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
-
# These result in bogus false positives
KBUILD_CFLAGS += $(call cc-disable-warning, dangling-pointer)
@@ -82,22 +77,17 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=designated-init)
# Warn if there is an enum types mismatch
KBUILD_CFLAGS += $(call cc-option,-Wenum-conversion)
+KBUILD_CFLAGS += -Wextra
+KBUILD_CFLAGS += -Wunused
+
#
# W=1 - warnings which may be relevant and do not occur too often
#
ifneq ($(findstring 1, $(KBUILD_EXTRA_WARN)),)
-KBUILD_CFLAGS += -Wextra -Wunused -Wno-unused-parameter
-KBUILD_CFLAGS += $(call cc-option, -Wrestrict)
KBUILD_CFLAGS += -Wmissing-format-attribute
-KBUILD_CFLAGS += -Wold-style-definition
KBUILD_CFLAGS += -Wmissing-include-dirs
-KBUILD_CFLAGS += $(call cc-option, -Wunused-but-set-variable)
KBUILD_CFLAGS += $(call cc-option, -Wunused-const-variable)
-KBUILD_CFLAGS += $(call cc-option, -Wpacked-not-aligned)
-KBUILD_CFLAGS += $(call cc-option, -Wformat-overflow)
-KBUILD_CFLAGS += $(call cc-option, -Wformat-truncation)
-KBUILD_CFLAGS += $(call cc-option, -Wstringop-truncation)
KBUILD_CPPFLAGS += -Wundef
KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN1
@@ -108,10 +98,16 @@ else
# Suppress them by using -Wno... except for W=1.
KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, restrict)
KBUILD_CFLAGS += $(call cc-disable-warning, packed-not-aligned)
KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow)
+ifdef CONFIG_CC_IS_GCC
KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation)
+else
+# Clang checks for overflow/truncation with '%p', while GCC does not:
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219
+KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf)
+KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf)
+endif
KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation)
KBUILD_CFLAGS += -Wno-override-init # alias for -Wno-initializer-overrides in clang
@@ -133,7 +129,6 @@ endif
KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare
KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access)
-KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict)
KBUILD_CFLAGS += -Wno-enum-compare-conditional
KBUILD_CFLAGS += -Wno-enum-enum-conversion
endif
@@ -148,9 +143,6 @@ ifneq ($(findstring 2, $(KBUILD_EXTRA_WARN)),)
KBUILD_CFLAGS += -Wdisabled-optimization
KBUILD_CFLAGS += -Wshadow
KBUILD_CFLAGS += $(call cc-option, -Wlogical-op)
-KBUILD_CFLAGS += -Wmissing-field-initializers
-KBUILD_CFLAGS += -Wtype-limits
-KBUILD_CFLAGS += $(call cc-option, -Wmaybe-uninitialized)
KBUILD_CFLAGS += $(call cc-option, -Wunused-macros)
KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2
@@ -190,6 +182,7 @@ else
# The following turn off the warnings enabled by -Wextra
KBUILD_CFLAGS += -Wno-sign-compare
+KBUILD_CFLAGS += -Wno-unused-parameter
endif
diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py
index cba589e5b57d61..2f11c4f9c345a0 100644
--- a/scripts/gdb/linux/cpus.py
+++ b/scripts/gdb/linux/cpus.py
@@ -26,11 +26,7 @@ def get_current_cpu():
if utils.get_gdbserver_type() == utils.GDBSERVER_QEMU:
return gdb.selected_thread().num - 1
elif utils.get_gdbserver_type() == utils.GDBSERVER_KGDB:
- tid = gdb.selected_thread().ptid[2]
- if tid > (0x100000000 - MAX_CPUS - 2):
- return 0x100000000 - tid - 2
- else:
- return tasks.get_thread_info(tasks.get_task_by_pid(tid))['cpu']
+ return gdb.parse_and_eval("kgdb_active.counter")
else:
raise gdb.GdbError("Sorry, obtaining the current CPU is not yet "
"supported with this gdb server.")
@@ -152,9 +148,8 @@ Note that VAR has to be quoted as string."""
def __init__(self):
super(PerCpu, self).__init__("lx_per_cpu")
- def invoke(self, var_name, cpu=-1):
- var_ptr = gdb.parse_and_eval("&" + var_name.string())
- return per_cpu(var_ptr, cpu)
+ def invoke(self, var, cpu=-1):
+ return per_cpu(var.address, cpu)
PerCpu()
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
index 6793d6e86e777b..62348397c1f5c8 100644
--- a/scripts/gdb/linux/tasks.py
+++ b/scripts/gdb/linux/tasks.py
@@ -85,7 +85,7 @@ thread_info_type = utils.CachedType("struct thread_info")
def get_thread_info(task):
thread_info_ptr_type = thread_info_type.get_type().pointer()
- if task.type.fields()[0].type == thread_info_type.get_type():
+ if task_type.get_type().fields()[0].type == thread_info_type.get_type():
return task['thread_info']
thread_info = task['stack'].cast(thread_info_ptr_type)
return thread_info.dereference()
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index 7d5278d815fa15..245ab297ea84a0 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -196,7 +196,7 @@ def get_gdbserver_type():
def probe_kgdb():
try:
thread_info = gdb.execute("info thread 2", to_string=True)
- return "shadowCPU0" in thread_info
+ return "shadowCPU" in thread_info
except gdb.error:
return False
diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h
index 570bb9794421b9..95483c7d81df74 100644
--- a/tools/include/linux/rbtree_augmented.h
+++ b/tools/include/linux/rbtree_augmented.h
@@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
- rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
+ rb->__rb_parent_color = rb_color(rb) + (unsigned long)p;
}
static inline void rb_set_parent_color(struct rb_node *rb,
struct rb_node *p, int color)
{
- rb->__rb_parent_color = (unsigned long)p | color;
+ rb->__rb_parent_color = (unsigned long)p + color;
}
static inline void
diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c
index 727396de6be54f..9e7307186b7f41 100644
--- a/tools/lib/rbtree.c
+++ b/tools/lib/rbtree.c
@@ -58,7 +58,7 @@
static inline void rb_set_black(struct rb_node *rb)
{
- rb->__rb_parent_color |= RB_BLACK;
+ rb->__rb_parent_color += RB_BLACK;
}
static inline struct rb_node *rb_red_parent(struct rb_node *red)
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index d26e962f2ac490..0b9ab987601cca 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -47,3 +47,5 @@ mkdirty
va_high_addr_switch
hugetlb_fault_after_madv
hugetlb_madv_vs_map
+mseal_test
+seal_elf
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 7ca9186a063943..1b07a48fc2ba4a 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -59,6 +59,8 @@ TEST_GEN_FILES += mlock2-tests
TEST_GEN_FILES += mrelease_test
TEST_GEN_FILES += mremap_dontunmap
TEST_GEN_FILES += mremap_test
+TEST_GEN_FILES += mseal_test
+TEST_GEN_FILES += seal_elf
TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += pagemap_ioctl
TEST_GEN_FILES += thuge-gen
diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c
new file mode 100644
index 00000000000000..ca8dbee0c612e7
--- /dev/null
+++ b/tools/testing/selftests/mm/mseal_test.c
@@ -0,0 +1,1893 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "../kselftest.h"
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/vfs.h>
+#include <sys/stat.h>
+
+/*
+ * need those definition for manually build using gcc.
+ * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 mseal_test.c -o mseal_test
+ */
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE 0x2
+#endif
+
+#ifndef PKEY_BITS_PER_KEY
+#define PKEY_BITS_PER_PKEY 2
+#endif
+
+#ifndef PKEY_MASK
+#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+#endif
+
+#define FAIL_TEST_IF_FALSE(c) do {\
+ if (!(c)) {\
+ ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\
+ goto test_end;\
+ } \
+ } \
+ while (0)
+
+#define SKIP_TEST_IF_FALSE(c) do {\
+ if (!(c)) {\
+ ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\
+ goto test_end;\
+ } \
+ } \
+ while (0)
+
+
+#define TEST_END_CHECK() {\
+ ksft_test_result_pass("%s\n", __func__);\
+ return;\
+test_end:\
+ return;\
+}
+
+#ifndef u64
+#define u64 unsigned long long
+#endif
+
+static unsigned long get_vma_size(void *addr, int *prot)
+{
+ FILE *maps;
+ char line[256];
+ int size = 0;
+ uintptr_t addr_start, addr_end;
+ char protstr[5];
+ *prot = 0;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps)
+ return 0;
+
+ while (fgets(line, sizeof(line), maps)) {
+ if (sscanf(line, "%lx-%lx %4s", &addr_start, &addr_end, protstr) == 3) {
+ if (addr_start == (uintptr_t) addr) {
+ size = addr_end - addr_start;
+ if (protstr[0] == 'r')
+ *prot |= 0x4;
+ if (protstr[1] == 'w')
+ *prot |= 0x2;
+ if (protstr[2] == 'x')
+ *prot |= 0x1;
+ break;
+ }
+ }
+ }
+ fclose(maps);
+ return size;
+}
+
+/*
+ * define sys_xyx to call syscall directly.
+ */
+static int sys_mseal(void *start, size_t len)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_mseal, start, len, 0);
+ return sret;
+}
+
+static int sys_mprotect(void *ptr, size_t size, unsigned long prot)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_mprotect, ptr, size, prot);
+ return sret;
+}
+
+static int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_pkey_mprotect, ptr, size, orig_prot, pkey);
+ return sret;
+}
+
+static void *sys_mmap(void *addr, unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long fd, unsigned long offset)
+{
+ void *sret;
+
+ errno = 0;
+ sret = (void *) syscall(__NR_mmap, addr, len, prot,
+ flags, fd, offset);
+ return sret;
+}
+
+static int sys_munmap(void *ptr, size_t size)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_munmap, ptr, size);
+ return sret;
+}
+
+static int sys_madvise(void *start, size_t len, int types)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_madvise, start, len, types);
+ return sret;
+}
+
+static int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+ int ret = syscall(__NR_pkey_alloc, flags, init_val);
+
+ return ret;
+}
+
+static unsigned int __read_pkey_reg(void)
+{
+ unsigned int pkey_reg = 0;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ unsigned int eax, edx;
+ unsigned int ecx = 0;
+
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (eax), "=d" (edx)
+ : "c" (ecx));
+ pkey_reg = eax;
+#endif
+ return pkey_reg;
+}
+
+static void __write_pkey_reg(u64 pkey_reg)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ unsigned int eax = pkey_reg;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (eax), "c" (ecx), "d" (edx));
+#endif
+}
+
+static unsigned long pkey_bit_position(int pkey)
+{
+ return pkey * PKEY_BITS_PER_PKEY;
+}
+
+static u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+ unsigned long shift = pkey_bit_position(pkey);
+
+ /* mask out bits from pkey in old value */
+ reg &= ~((u64)PKEY_MASK << shift);
+ /* OR in new bits for pkey */
+ reg |= (flags & PKEY_MASK) << shift;
+ return reg;
+}
+
+static void set_pkey(int pkey, unsigned long pkey_value)
+{
+ u64 new_pkey_reg;
+
+ new_pkey_reg = set_pkey_bits(__read_pkey_reg(), pkey, pkey_value);
+ __write_pkey_reg(new_pkey_reg);
+}
+
+static void setup_single_address(int size, void **ptrOut)
+{
+ void *ptr;
+
+ ptr = sys_mmap(NULL, size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ *ptrOut = ptr;
+}
+
+static void setup_single_address_rw(int size, void **ptrOut)
+{
+ void *ptr;
+ unsigned long mapflags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+ ptr = sys_mmap(NULL, size, PROT_READ | PROT_WRITE, mapflags, -1, 0);
+ *ptrOut = ptr;
+}
+
+static int clean_single_address(void *ptr, int size)
+{
+ int ret;
+ ret = munmap(ptr, size);
+ return ret;
+}
+
+static int seal_single_address(void *ptr, int size)
+{
+ int ret;
+ ret = sys_mseal(ptr, size);
+ return ret;
+}
+
+bool seal_support(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+
+ ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (ptr == (void *) -1)
+ return false;
+
+ ret = sys_mseal(ptr, page_size);
+ if (ret < 0)
+ return false;
+
+ return true;
+}
+
+bool pkey_supported(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ int pkey = sys_pkey_alloc(0, 0);
+
+ if (pkey > 0)
+ return true;
+#endif
+ return false;
+}
+
+static void test_seal_addseal(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_unmapped_start(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* munmap 2 pages from ptr. */
+ ret = sys_munmap(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail because 2 pages from ptr are unmapped. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail because 2 pages from ptr are unmapped. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ ret = sys_mseal(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_unmapped_middle(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* munmap 2 pages from ptr + page. */
+ ret = sys_munmap(ptr + page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail, since middle 2 pages are unmapped. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail as well. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* we still can add seal to the first page and last page*/
+ ret = sys_mseal(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mseal(ptr + 3 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_unmapped_end(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap last 2 pages. */
+ ret = sys_munmap(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail since last 2 pages are unmapped. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail as well. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* The first 2 pages is not sealed, and can add seals */
+ ret = sys_mseal(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_multiple_vmas(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split the vma into 3. */
+ ret = sys_mprotect(ptr + page_size, 2 * page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will get applied to all 4 pages - 3 VMAs. */
+ ret = sys_mprotect(ptr, size, PROT_READ);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* use mprotect to split the vma into 3. */
+ ret = sys_mprotect(ptr + page_size, 2 * page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mseal get applied to all 4 pages - 3 VMAs. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_split_start(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split at middle */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the first page, this will split the VMA */
+ ret = sys_mseal(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* add seal to the remain 3 pages */
+ ret = sys_mseal(ptr + page_size, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_split_end(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split at middle */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the last page */
+ ret = sys_mseal(ptr + 3 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* Adding seals to the first 3 pages */
+ ret = sys_mseal(ptr, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_invalid_input(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(8 * page_size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ ret = clean_single_address(ptr + 4 * page_size, 4 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* invalid flag */
+ ret = syscall(__NR_mseal, ptr, size, 0x20);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* unaligned address */
+ ret = sys_mseal(ptr + 1, 2 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* length too big */
+ ret = sys_mseal(ptr, 5 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* length overflow */
+ ret = sys_mseal(ptr, UINT64_MAX/page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* start is not in a valid VMA */
+ ret = sys_mseal(ptr - page_size, 5 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_zero_length(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_mprotect(ptr, 0, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal 0 length will be OK, same as mprotect */
+ ret = sys_mseal(ptr, 0);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* verify the 4 pages are not sealed by previous call. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_zero_address(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int prot;
+
+ /* use mmap to change protection. */
+ ptr = sys_mmap(0, size, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ FAIL_TEST_IF_FALSE(ptr == 0);
+
+ size = get_vma_size(ptr, &prot);
+ FAIL_TEST_IF_FALSE(size == 4 * page_size);
+
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* verify the 4 pages are sealed by previous call. */
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_twice(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* apply the same seal will be OK. idempotent. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_mprotect(ptr, size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_start_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* the first page is sealed. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* pages after the first page is not sealed. */
+ ret = sys_mprotect(ptr + page_size, page_size * 3,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_end_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr + page_size, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* first page is not sealed */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* last 3 page are sealed */
+ ret = sys_mprotect(ptr + page_size, page_size * 3,
+ PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_unalign_len(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, page_size * 2 - 1);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 2 pages are sealed. */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mprotect(ptr + page_size * 2, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_unalign_len_variant_2(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ if (seal) {
+ ret = seal_single_address(ptr, page_size * 2 + 1);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 3 pages are sealed. */
+ ret = sys_mprotect(ptr, page_size * 3, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mprotect(ptr + page_size * 3, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_two_vma(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = seal_single_address(ptr, page_size * 4);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_mprotect(ptr + page_size * 2, page_size * 2,
+ PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_two_vma_with_split(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split as two vma. */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mseal can apply across 2 vma, also split them. */
+ if (seal) {
+ ret = seal_single_address(ptr + page_size, page_size * 2);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* the first page is not sealed. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* the second page is sealed. */
+ ret = sys_mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* the third page is sealed. */
+ ret = sys_mprotect(ptr + 2 * page_size, page_size,
+ PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* the fouth page is not sealed. */
+ ret = sys_mprotect(ptr + 3 * page_size, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_partial_mprotect(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* seal one page. */
+ if (seal) {
+ ret = seal_single_address(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mprotect first 2 page will fail, since the first page are sealed. */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ | PROT_WRITE);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_two_vma_with_gap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* use mprotect to split. */
+ ret = sys_mprotect(ptr + 3 * page_size, page_size,
+ PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* use munmap to free two pages in the middle */
+ ret = sys_munmap(ptr + page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* mprotect will fail, because there is a gap in the address. */
+ /* notes, internally mprotect still updated the first page. */
+ ret = sys_mprotect(ptr, 4 * page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* mseal will fail as well. */
+ ret = sys_mseal(ptr, 4 * page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* the first page is not sealed. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret == 0);
+
+ /* the last page is not sealed. */
+ ret = sys_mprotect(ptr + 3 * page_size, page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret == 0);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_split(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal all 4 pages. */
+ if (seal) {
+ ret = sys_mseal(ptr, 4 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mprotect is sealed. */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+
+ ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mprotect_merge(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split one page. */
+ ret = sys_mprotect(ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal first two pages. */
+ if (seal) {
+ ret = sys_mseal(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 2 pages are sealed. */
+ ret = sys_mprotect(ptr, 2 * page_size, PROT_READ);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* last 2 pages are not sealed. */
+ ret = sys_mprotect(ptr + 2 * page_size, 2 * page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(ret == 0);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_munmap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* 4 pages are sealed. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+/*
+ * allocate 4 pages,
+ * use mprotect to split it as two VMAs
+ * seal the whole range
+ * munmap will fail on both
+ */
+static void test_seal_munmap_two_vma(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect to split */
+ ret = sys_mprotect(ptr, page_size * 2, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_munmap(ptr, page_size * 2);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr + page_size, page_size * 2);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+/*
+ * allocate a VMA with 4 pages.
+ * munmap the middle 2 pages.
+ * seal the whole 4 pages, will fail.
+ * munmap the first page will be OK.
+ * munmap the last page will be OK.
+ */
+static void test_seal_munmap_vma_with_gap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ ret = sys_munmap(ptr + page_size, page_size * 2);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ /* can't have gap in the middle. */
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+ }
+
+ ret = sys_munmap(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr + page_size * 2, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_munmap_start_freed(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int prot;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap the first page. */
+ ret = sys_munmap(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the last 3 pages. */
+ if (seal) {
+ ret = sys_mseal(ptr + page_size, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* unmap from the first page. */
+ ret = sys_munmap(ptr, size);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size * 3);
+ } else {
+ /* note: this will be OK, even the first page is */
+ /* already unmapped. */
+ FAIL_TEST_IF_FALSE(!ret);
+
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 0);
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_munmap_end_freed(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap last page. */
+ ret = sys_munmap(ptr + page_size * 3, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the first 3 pages. */
+ if (seal) {
+ ret = sys_mseal(ptr, 3 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* unmap all pages. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_munmap_middle_freed(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int prot;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* unmap 2 pages in the middle. */
+ ret = sys_munmap(ptr + page_size, page_size * 2);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* seal the first page. */
+ if (seal) {
+ ret = sys_mseal(ptr, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* munmap all 4 pages. */
+ ret = sys_munmap(ptr, size);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ size = get_vma_size(ptr, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+
+ size = get_vma_size(ptr + page_size * 3, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+ } else {
+ FAIL_TEST_IF_FALSE(!ret);
+
+ size = get_vma_size(ptr, &prot);
+ FAIL_TEST_IF_FALSE(size == 0);
+
+ size = get_vma_size(ptr + page_size * 3, &prot);
+ FAIL_TEST_IF_FALSE(size == 0);
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_shrink(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* shrink from 4 pages to 2 pages. */
+ ret2 = mremap(ptr, size, 2 * page_size, 0, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED);
+
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_expand(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ /* ummap last 2 pages. */
+ ret = sys_munmap(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* expand from 2 page to 4 pages. */
+ ret2 = mremap(ptr, 2 * page_size, 4 * page_size, 0, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_move(bool seal)
+{
+ void *ptr, *newPtr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newPtr);
+ FAIL_TEST_IF_FALSE(newPtr != (void *)-1);
+ ret = clean_single_address(newPtr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* move from ptr to fixed address. */
+ ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newPtr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED);
+
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mmap_overwrite_prot(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* use mmap to change protection. */
+ ret2 = sys_mmap(ptr, size, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mmap_expand(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 12 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ /* ummap last 4 pages. */
+ ret = sys_munmap(ptr + 8 * page_size, 4 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, 8 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* use mmap to expand. */
+ ret2 = sys_mmap(ptr, size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mmap_shrink(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 12 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* use mmap to shrink. */
+ ret2 = sys_mmap(ptr, 8 * page_size, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == ptr);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_shrink_fixed(bool seal)
+{
+ void *ptr;
+ void *newAddr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newAddr);
+ FAIL_TEST_IF_FALSE(newAddr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move and shrink to fixed address */
+ ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED,
+ newAddr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == newAddr);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_expand_fixed(bool seal)
+{
+ void *ptr;
+ void *newAddr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(page_size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newAddr);
+ FAIL_TEST_IF_FALSE(newAddr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(newAddr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move and expand to fixed address */
+ ret2 = mremap(ptr, page_size, size, MREMAP_MAYMOVE | MREMAP_FIXED,
+ newAddr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == newAddr);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_move_fixed(bool seal)
+{
+ void *ptr;
+ void *newAddr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+ setup_single_address(size, &newAddr);
+ FAIL_TEST_IF_FALSE(newAddr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(newAddr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move to fixed address */
+ ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, newAddr);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else
+ FAIL_TEST_IF_FALSE(ret2 == newAddr);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_move_fixed_zero(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /*
+ * MREMAP_FIXED can move the mapping to zero address
+ */
+ ret2 = mremap(ptr, size, 2 * page_size, MREMAP_MAYMOVE | MREMAP_FIXED,
+ 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 == 0);
+
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_move_dontunmap(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* mremap to move, and don't unmap src addr. */
+ ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, 0);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED);
+
+ }
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_mremap_move_dontunmap_anyaddr(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ void *ret2;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /*
+ * The 0xdeaddead should not have effect on dest addr
+ * when MREMAP_DONTUNMAP is set.
+ */
+ ret2 = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_DONTUNMAP,
+ 0xdeaddead);
+ if (seal) {
+ FAIL_TEST_IF_FALSE(ret2 == MAP_FAILED);
+ FAIL_TEST_IF_FALSE(errno == EPERM);
+ } else {
+ FAIL_TEST_IF_FALSE(ret2 != MAP_FAILED);
+ FAIL_TEST_IF_FALSE((long)ret2 != 0xdeaddead);
+
+ }
+
+ TEST_END_CHECK();
+}
+
+
+static void test_seal_merge_and_split(void)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size;
+ int ret;
+ int prot;
+
+ /* (24 RO) */
+ setup_single_address(24 * page_size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ /* use mprotect(NONE) to set out boundary */
+ /* (1 NONE) (22 RO) (1 NONE) */
+ ret = sys_mprotect(ptr, page_size, PROT_NONE);
+ FAIL_TEST_IF_FALSE(!ret);
+ ret = sys_mprotect(ptr + 23 * page_size, page_size, PROT_NONE);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 22 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 4);
+
+ /* use mseal to split from beginning */
+ /* (1 NONE) (1 RO_SEAL) (21 RO) (1 NONE) */
+ ret = sys_mseal(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ size = get_vma_size(ptr + 2 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 21 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* use mseal to split from the end. */
+ /* (1 NONE) (1 RO_SEAL) (20 RO) (1 RO_SEAL) (1 NONE) */
+ ret = sys_mseal(ptr + 22 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + 22 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ size = get_vma_size(ptr + 2 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 20 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* merge with prev. */
+ /* (1 NONE) (2 RO_SEAL) (19 RO) (1 RO_SEAL) (1 NONE) */
+ ret = sys_mseal(ptr + 2 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 2 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* merge with after. */
+ /* (1 NONE) (2 RO_SEAL) (18 RO) (2 RO_SEALS) (1 NONE) */
+ ret = sys_mseal(ptr + 21 * page_size, page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + 21 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 2 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* split and merge from prev */
+ /* (1 NONE) (3 RO_SEAL) (17 RO) (2 RO_SEALS) (1 NONE) */
+ ret = sys_mseal(ptr + 2 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + 1 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 3 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ ret = sys_munmap(ptr + page_size, page_size);
+ FAIL_TEST_IF_FALSE(ret < 0);
+ ret = sys_mprotect(ptr + 2 * page_size, page_size, PROT_NONE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+
+ /* split and merge from next */
+ /* (1 NONE) (3 RO_SEAL) (16 RO) (3 RO_SEALS) (1 NONE) */
+ ret = sys_mseal(ptr + 20 * page_size, 2 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+ size = get_vma_size(ptr + 20 * page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 3 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ /* merge from middle of prev and middle of next. */
+ /* (1 NONE) (22 RO_SEAL) (1 NONE) */
+ ret = sys_mseal(ptr + 2 * page_size, 20 * page_size);
+ FAIL_TEST_IF_FALSE(!ret);
+ size = get_vma_size(ptr + page_size, &prot);
+ FAIL_TEST_IF_FALSE(size == 22 * page_size);
+ FAIL_TEST_IF_FALSE(prot == 0x4);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_discard_ro_anon_on_rw(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address_rw(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't take effect on RW memory. */
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* base seal still apply. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_discard_ro_anon_on_pkey(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int pkey;
+
+ SKIP_TEST_IF_FALSE(pkey_supported());
+
+ setup_single_address_rw(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ pkey = sys_pkey_alloc(0, 0);
+ FAIL_TEST_IF_FALSE(pkey > 0);
+
+ ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't take effect if PKRU allow write. */
+ set_pkey(pkey, 0);
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* sealing will take effect if PKRU deny write. */
+ set_pkey(pkey, PKEY_DISABLE_WRITE);
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ /* base seal still apply. */
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_discard_ro_anon_on_filebacked(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ int fd;
+ unsigned long mapflags = MAP_PRIVATE;
+
+ fd = memfd_create("test", 0);
+ FAIL_TEST_IF_FALSE(fd > 0);
+
+ ret = fallocate(fd, 0, 0, size);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ptr = sys_mmap(NULL, size, PROT_READ, mapflags, fd, 0);
+ FAIL_TEST_IF_FALSE(ptr != MAP_FAILED);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't apply for file backed mapping. */
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+ close(fd);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_discard_ro_anon_on_shared(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+ unsigned long mapflags = MAP_ANONYMOUS | MAP_SHARED;
+
+ ptr = sys_mmap(NULL, size, PROT_READ, mapflags, -1, 0);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = sys_mseal(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ /* sealing doesn't apply for shared mapping. */
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+static void test_seal_discard_ro_anon(bool seal)
+{
+ void *ptr;
+ unsigned long page_size = getpagesize();
+ unsigned long size = 4 * page_size;
+ int ret;
+
+ setup_single_address(size, &ptr);
+ FAIL_TEST_IF_FALSE(ptr != (void *)-1);
+
+ if (seal) {
+ ret = seal_single_address(ptr, size);
+ FAIL_TEST_IF_FALSE(!ret);
+ }
+
+ ret = sys_madvise(ptr, size, MADV_DONTNEED);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ ret = sys_munmap(ptr, size);
+ if (seal)
+ FAIL_TEST_IF_FALSE(ret < 0);
+ else
+ FAIL_TEST_IF_FALSE(!ret);
+
+ TEST_END_CHECK();
+}
+
+int main(int argc, char **argv)
+{
+ bool test_seal = seal_support();
+
+ ksft_print_header();
+
+ if (!test_seal)
+ ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n");
+
+ if (!pkey_supported())
+ ksft_print_msg("PKEY not supported\n");
+
+ ksft_set_plan(80);
+
+ test_seal_addseal();
+ test_seal_unmapped_start();
+ test_seal_unmapped_middle();
+ test_seal_unmapped_end();
+ test_seal_multiple_vmas();
+ test_seal_split_start();
+ test_seal_split_end();
+ test_seal_invalid_input();
+ test_seal_zero_length();
+ test_seal_twice();
+
+ test_seal_mprotect(false);
+ test_seal_mprotect(true);
+
+ test_seal_start_mprotect(false);
+ test_seal_start_mprotect(true);
+
+ test_seal_end_mprotect(false);
+ test_seal_end_mprotect(true);
+
+ test_seal_mprotect_unalign_len(false);
+ test_seal_mprotect_unalign_len(true);
+
+ test_seal_mprotect_unalign_len_variant_2(false);
+ test_seal_mprotect_unalign_len_variant_2(true);
+
+ test_seal_mprotect_two_vma(false);
+ test_seal_mprotect_two_vma(true);
+
+ test_seal_mprotect_two_vma_with_split(false);
+ test_seal_mprotect_two_vma_with_split(true);
+
+ test_seal_mprotect_partial_mprotect(false);
+ test_seal_mprotect_partial_mprotect(true);
+
+ test_seal_mprotect_two_vma_with_gap(false);
+ test_seal_mprotect_two_vma_with_gap(true);
+
+ test_seal_mprotect_merge(false);
+ test_seal_mprotect_merge(true);
+
+ test_seal_mprotect_split(false);
+ test_seal_mprotect_split(true);
+
+ test_seal_munmap(false);
+ test_seal_munmap(true);
+ test_seal_munmap_two_vma(false);
+ test_seal_munmap_two_vma(true);
+ test_seal_munmap_vma_with_gap(false);
+ test_seal_munmap_vma_with_gap(true);
+
+ test_munmap_start_freed(false);
+ test_munmap_start_freed(true);
+ test_munmap_middle_freed(false);
+ test_munmap_middle_freed(true);
+ test_munmap_end_freed(false);
+ test_munmap_end_freed(true);
+
+ test_seal_mremap_shrink(false);
+ test_seal_mremap_shrink(true);
+ test_seal_mremap_expand(false);
+ test_seal_mremap_expand(true);
+ test_seal_mremap_move(false);
+ test_seal_mremap_move(true);
+
+ test_seal_mremap_shrink_fixed(false);
+ test_seal_mremap_shrink_fixed(true);
+ test_seal_mremap_expand_fixed(false);
+ test_seal_mremap_expand_fixed(true);
+ test_seal_mremap_move_fixed(false);
+ test_seal_mremap_move_fixed(true);
+ test_seal_mremap_move_dontunmap(false);
+ test_seal_mremap_move_dontunmap(true);
+ test_seal_mremap_move_fixed_zero(false);
+ test_seal_mremap_move_fixed_zero(true);
+ test_seal_mremap_move_dontunmap_anyaddr(false);
+ test_seal_mremap_move_dontunmap_anyaddr(true);
+ test_seal_discard_ro_anon(false);
+ test_seal_discard_ro_anon(true);
+ test_seal_discard_ro_anon_on_rw(false);
+ test_seal_discard_ro_anon_on_rw(true);
+ test_seal_discard_ro_anon_on_shared(false);
+ test_seal_discard_ro_anon_on_shared(true);
+ test_seal_discard_ro_anon_on_filebacked(false);
+ test_seal_discard_ro_anon_on_filebacked(true);
+ test_seal_mmap_overwrite_prot(false);
+ test_seal_mmap_overwrite_prot(true);
+ test_seal_mmap_expand(false);
+ test_seal_mmap_expand(true);
+ test_seal_mmap_shrink(false);
+ test_seal_mmap_shrink(true);
+
+ test_seal_merge_and_split();
+ test_seal_zero_address();
+
+ test_seal_discard_ro_anon_on_pkey(false);
+ test_seal_discard_ro_anon_on_pkey(true);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/seal_elf.c b/tools/testing/selftests/mm/seal_elf.c
new file mode 100644
index 00000000000000..f2babec79bb63d
--- /dev/null
+++ b/tools/testing/selftests/mm/seal_elf.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "../kselftest.h"
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/vfs.h>
+#include <sys/stat.h>
+
+/*
+ * need those definition for manually build using gcc.
+ * gcc -I ../../../../usr/include -DDEBUG -O3 -DDEBUG -O3 seal_elf.c -o seal_elf
+ */
+#define FAIL_TEST_IF_FALSE(c) do {\
+ if (!(c)) {\
+ ksft_test_result_fail("%s, line:%d\n", __func__, __LINE__);\
+ goto test_end;\
+ } \
+ } \
+ while (0)
+
+#define SKIP_TEST_IF_FALSE(c) do {\
+ if (!(c)) {\
+ ksft_test_result_skip("%s, line:%d\n", __func__, __LINE__);\
+ goto test_end;\
+ } \
+ } \
+ while (0)
+
+
+#define TEST_END_CHECK() {\
+ ksft_test_result_pass("%s\n", __func__);\
+ return;\
+test_end:\
+ return;\
+}
+
+#ifndef u64
+#define u64 unsigned long long
+#endif
+
+/*
+ * define sys_xyx to call syscall directly.
+ */
+static int sys_mseal(void *start, size_t len)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_mseal, start, len, 0);
+ return sret;
+}
+
+static void *sys_mmap(void *addr, unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long fd, unsigned long offset)
+{
+ void *sret;
+
+ errno = 0;
+ sret = (void *) syscall(__NR_mmap, addr, len, prot,
+ flags, fd, offset);
+ return sret;
+}
+
+static inline int sys_mprotect(void *ptr, size_t size, unsigned long prot)
+{
+ int sret;
+
+ errno = 0;
+ sret = syscall(__NR_mprotect, ptr, size, prot);
+ return sret;
+}
+
+static bool seal_support(void)
+{
+ int ret;
+ void *ptr;
+ unsigned long page_size = getpagesize();
+
+ ptr = sys_mmap(NULL, page_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (ptr == (void *) -1)
+ return false;
+
+ ret = sys_mseal(ptr, page_size);
+ if (ret < 0)
+ return false;
+
+ return true;
+}
+
+const char somestr[4096] = {"READONLY"};
+
+static void test_seal_elf(void)
+{
+ int ret;
+ FILE *maps;
+ char line[512];
+ uintptr_t addr_start, addr_end;
+ char prot[5];
+ char filename[256];
+ unsigned long page_size = getpagesize();
+ unsigned long long ptr = (unsigned long long) somestr;
+ char *somestr2 = (char *)somestr;
+
+ /*
+ * Modify the protection of readonly somestr
+ */
+ if (((unsigned long long)ptr % page_size) != 0)
+ ptr = (unsigned long long)ptr & ~(page_size - 1);
+
+ ksft_print_msg("somestr = %s\n", somestr);
+ ksft_print_msg("change protection to rw\n");
+ ret = sys_mprotect((void *)ptr, page_size, PROT_READ|PROT_WRITE);
+ FAIL_TEST_IF_FALSE(!ret);
+ *somestr2 = 'A';
+ ksft_print_msg("somestr is modified to: %s\n", somestr);
+ ret = sys_mprotect((void *)ptr, page_size, PROT_READ);
+ FAIL_TEST_IF_FALSE(!ret);
+
+ maps = fopen("/proc/self/maps", "r");
+ FAIL_TEST_IF_FALSE(maps);
+
+ /*
+ * apply sealing to elf binary
+ */
+ while (fgets(line, sizeof(line), maps)) {
+ if (sscanf(line, "%lx-%lx %4s %*x %*x:%*x %*u %255[^\n]",
+ &addr_start, &addr_end, prot, filename) == 4) {
+ if (strlen(filename)) {
+ /*
+ * seal the mapping if read only.
+ */
+ if (strstr(prot, "r-")) {
+ ret = sys_mseal((void *)addr_start, addr_end - addr_start);
+ FAIL_TEST_IF_FALSE(!ret);
+ ksft_print_msg("sealed: %lx-%lx %s %s\n",
+ addr_start, addr_end, prot, filename);
+ if ((uintptr_t) somestr >= addr_start &&
+ (uintptr_t) somestr <= addr_end)
+ ksft_print_msg("mapping for somestr found\n");
+ }
+ }
+ }
+ }
+ fclose(maps);
+
+ ret = sys_mprotect((void *)ptr, page_size, PROT_READ | PROT_WRITE);
+ FAIL_TEST_IF_FALSE(ret < 0);
+ ksft_print_msg("somestr is sealed, mprotect is rejected\n");
+
+ TEST_END_CHECK();
+}
+
+int main(int argc, char **argv)
+{
+ bool test_seal = seal_support();
+
+ ksft_print_header();
+ ksft_print_msg("pid=%d\n", getpid());
+
+ if (!test_seal)
+ ksft_exit_skip("sealing not supported, check CONFIG_64BIT\n");
+
+ ksft_set_plan(1);
+
+ test_seal_elf();
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
index 7dbfa53d93a05f..bdfa5d085f0099 100644
--- a/tools/testing/selftests/mm/soft-dirty.c
+++ b/tools/testing/selftests/mm/soft-dirty.c
@@ -209,5 +209,5 @@ int main(int argc, char **argv)
close(pagemap_fd);
- return ksft_exit_pass();
+ ksft_finished();
}
diff --git a/tools/writeback/wb_monitor.py b/tools/writeback/wb_monitor.py
new file mode 100644
index 00000000000000..5e3591f1f9a9df
--- /dev/null
+++ b/tools/writeback/wb_monitor.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2024 Kemeng Shi <shikemeng@huaweicloud.com>
+# Copyright (C) 2024 Huawei Inc
+
+desc = """
+This is a drgn script based on wq_monitor.py to monitor writeback info on
+backing dev. For more info on drgn, visit https://github.com/osandov/drgn.
+
+ writeback(kB) Amount of dirty pages are currently being written back to
+ disk.
+
+ reclaimable(kB) Amount of pages are currently reclaimable.
+
+ dirtied(kB) Amount of pages have been dirtied.
+
+ wrttien(kB) Amount of dirty pages have been written back to disk.
+
+ avg_wb(kBps) Smoothly estimated write bandwidth of writing dirty pages
+ back to disk.
+"""
+
+import signal
+import re
+import time
+import json
+
+import drgn
+from drgn.helpers.linux.list import list_for_each_entry
+
+import argparse
+parser = argparse.ArgumentParser(description=desc,
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument('bdi', metavar='REGEX', nargs='*',
+ help='Target backing device name patterns (all if empty)')
+parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
+ help='Monitoring interval (0 to print once and exit)')
+parser.add_argument('-j', '--json', action='store_true',
+ help='Output in json')
+parser.add_argument('-c', '--cgroup', action='store_true',
+ help='show writeback of bdi in cgroup')
+args = parser.parse_args()
+
+bdi_list = prog['bdi_list']
+
+WB_RECLAIMABLE = prog['WB_RECLAIMABLE']
+WB_WRITEBACK = prog['WB_WRITEBACK']
+WB_DIRTIED = prog['WB_DIRTIED']
+WB_WRITTEN = prog['WB_WRITTEN']
+NR_WB_STAT_ITEMS = prog['NR_WB_STAT_ITEMS']
+
+PAGE_SHIFT = prog['PAGE_SHIFT']
+
+def K(x):
+ return x << (PAGE_SHIFT - 10)
+
+class Stats:
+ def dict(self, now):
+ return { 'timestamp' : now,
+ 'name' : self.name,
+ 'writeback' : self.stats[WB_WRITEBACK],
+ 'reclaimable' : self.stats[WB_RECLAIMABLE],
+ 'dirtied' : self.stats[WB_DIRTIED],
+ 'written' : self.stats[WB_WRITTEN],
+ 'avg_wb' : self.avg_bw, }
+
+ def table_header_str():
+ return f'{"":>16} {"writeback":>10} {"reclaimable":>12} ' \
+ f'{"dirtied":>9} {"written":>9} {"avg_bw":>9}'
+
+ def table_row_str(self):
+ out = f'{self.name[-16:]:16} ' \
+ f'{self.stats[WB_WRITEBACK]:10} ' \
+ f'{self.stats[WB_RECLAIMABLE]:12} ' \
+ f'{self.stats[WB_DIRTIED]:9} ' \
+ f'{self.stats[WB_WRITTEN]:9} ' \
+ f'{self.avg_bw:9} '
+ return out
+
+ def show_header():
+ if Stats.table_fmt:
+ print()
+ print(Stats.table_header_str())
+
+ def show_stats(self):
+ if Stats.table_fmt:
+ print(self.table_row_str())
+ else:
+ print(self.dict(Stats.now))
+
+class WbStats(Stats):
+ def __init__(self, wb):
+ bdi_name = wb.bdi.dev_name.string_().decode()
+ # avoid to use bdi.wb.memcg_css which is only defined when
+ # CONFIG_CGROUP_WRITEBACK is enabled
+ if wb == wb.bdi.wb.address_of_():
+ ino = "1"
+ else:
+ ino = str(wb.memcg_css.cgroup.kn.id.value_())
+ self.name = bdi_name + '_' + ino
+
+ self.stats = [0] * NR_WB_STAT_ITEMS
+ for i in range(NR_WB_STAT_ITEMS):
+ if wb.stat[i].count >= 0:
+ self.stats[i] = int(K(wb.stat[i].count))
+ else:
+ self.stats[i] = 0
+
+ self.avg_bw = int(K(wb.avg_write_bandwidth))
+
+class BdiStats(Stats):
+ def __init__(self, bdi):
+ self.name = bdi.dev_name.string_().decode()
+ self.stats = [0] * NR_WB_STAT_ITEMS
+ self.avg_bw = 0
+
+ def collectStats(self, wb_stats):
+ for i in range(NR_WB_STAT_ITEMS):
+ self.stats[i] += wb_stats.stats[i]
+
+ self.avg_bw += wb_stats.avg_bw
+
+exit_req = False
+
+def sigint_handler(signr, frame):
+ global exit_req
+ exit_req = True
+
+def main():
+ # handle args
+ Stats.table_fmt = not args.json
+ interval = args.interval
+ cgroup = args.cgroup
+
+ re_str = None
+ if args.bdi:
+ for r in args.bdi:
+ if re_str is None:
+ re_str = r
+ else:
+ re_str += '|' + r
+
+ filter_re = re.compile(re_str) if re_str else None
+
+ # monitoring loop
+ signal.signal(signal.SIGINT, sigint_handler)
+
+ while not exit_req:
+ Stats.now = time.time()
+
+ Stats.show_header()
+ for bdi in list_for_each_entry('struct backing_dev_info', bdi_list.address_of_(), 'bdi_list'):
+ bdi_stats = BdiStats(bdi)
+ if filter_re and not filter_re.search(bdi_stats.name):
+ continue
+
+ for wb in list_for_each_entry('struct bdi_writeback', bdi.wb_list.address_of_(), 'bdi_node'):
+ wb_stats = WbStats(wb)
+ bdi_stats.collectStats(wb_stats)
+ if cgroup:
+ wb_stats.show_stats()
+
+ bdi_stats.show_stats()
+ if cgroup and Stats.table_fmt:
+ print()
+
+ if interval == 0:
+ break
+ time.sleep(interval)
+
+if __name__ == "__main__":
+ main()
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ff0a20565f9087..92c70c99420af1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2901,7 +2901,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
spinlock_t *ptl;
int r;
- r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+ r = follow_pte(vma, addr, &ptep, &ptl);
if (r) {
/*
* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
@@ -2916,7 +2916,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
if (r)
return r;
- r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+ r = follow_pte(vma, addr, &ptep, &ptl);
if (r)
return r;
}