aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Ellerman <mpe@ellerman.id.au>2015-06-05 22:39:03 +1000
committerMichael Ellerman <mpe@ellerman.id.au>2015-06-05 22:39:03 +1000
commitec868bdb35b66363bfe8998e4f522cb3743a160b (patch)
tree5bb7106d83d4d47e4cc22e22d181b4ea62e53a4d
parent36469a05610408bbd1e4705a9e721bf2009135de (diff)
parent45be0ab9ce2e8142ff9f21d8d15debc5ee9d8603 (diff)
downloadlinux-next-ec868bdb35b66363bfe8998e4f522cb3743a160b.tar.gz
Merge branch 'akpm-current/current'
Conflicts: arch/x86/Kconfig arch/x86/kernel/machine_kexec_64.c drivers/staging/android/lowmemorykiller.c
-rw-r--r--.mailmap2
-rw-r--r--Documentation/ABI/testing/configfs-spear-pcie-gadget2
-rw-r--r--Documentation/ABI/testing/dev-kmsg9
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb-lvstest12
-rw-r--r--Documentation/ABI/testing/sysfs-class-zram24
-rw-r--r--Documentation/blockdev/zram.txt29
-rw-r--r--Documentation/devicetree/bindings/w1/omap-hdq.txt7
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--Documentation/lockup-watchdogs.txt18
-rw-r--r--Documentation/misc-devices/spear-pcie-gadget.txt2
-rw-r--r--Documentation/networking/netconsole.txt35
-rw-r--r--Documentation/printk-formats.txt8
-rw-r--r--Documentation/sysctl/kernel.txt25
-rw-r--r--Documentation/vm/unevictable-lru.txt8
-rw-r--r--Documentation/vm/userfaultfd.txt144
-rw-r--r--Documentation/vm/zswap.txt18
-rw-r--r--Documentation/w1/masters/omap-hdq6
-rw-r--r--arch/Kconfig7
-rw-r--r--arch/alpha/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/alpha/include/uapi/asm/mman.h1
-rw-r--r--arch/arc/include/asm/dma-mapping.h12
-rw-r--r--arch/arc/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/arm/include/asm/hugetlb.h13
-rw-r--r--arch/arm/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/arm/include/asm/pgtable-3level.h1
-rw-r--r--arch/arm/mm/hugetlbpage.c5
-rw-r--r--arch/arm64/include/asm/hugetlb.h13
-rw-r--r--arch/arm64/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/arm64/include/asm/pgtable.h2
-rw-r--r--arch/arm64/mm/hugetlbpage.c7
-rw-r--r--arch/avr32/include/asm/dma-mapping.h19
-rw-r--r--arch/avr32/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/blackfin/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/c6x/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/cris/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/frv/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/frv/include/asm/sections.h6
-rw-r--r--arch/frv/mb93090-mb00/pci-dma-nommu.c10
-rw-r--r--arch/frv/mb93090-mb00/pci-dma.c7
-rw-r--r--arch/hexagon/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/ia64/include/asm/hugetlb.h13
-rw-r--r--arch/ia64/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/ia64/mm/hugetlbpage.c5
-rw-r--r--arch/ia64/mm/numa.c19
-rw-r--r--arch/m32r/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/m68k/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/metag/include/asm/hugetlb.h13
-rw-r--r--arch/metag/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/metag/mm/hugetlbpage.c5
-rw-r--r--arch/microblaze/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/mips/include/asm/hugetlb.h13
-rw-r--r--arch/mips/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/mips/include/asm/pgtable.h8
-rw-r--r--arch/mips/include/uapi/asm/mman.h1
-rw-r--r--arch/mips/mm/dma-default.c30
-rw-r--r--arch/mips/mm/hugetlbpage.c5
-rw-r--r--arch/mn10300/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/nios2/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/openrisc/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/parisc/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/parisc/include/uapi/asm/mman.h1
-rw-r--r--arch/parisc/kernel/pci-dma.c27
-rw-r--r--arch/powerpc/include/asm/hugetlb.h14
-rw-r--r--arch/powerpc/include/asm/mm-arch-hooks.h28
-rw-r--r--arch/powerpc/include/asm/mmu_context.h23
-rw-r--r--arch/powerpc/include/asm/pgtable-ppc64.h16
-rw-r--r--arch/powerpc/include/asm/systbl.h1
-rw-r--r--arch/powerpc/include/asm/unistd.h2
-rw-r--r--arch/powerpc/include/uapi/asm/unistd.h1
-rw-r--r--arch/powerpc/kernel/vio.c10
-rw-r--r--arch/powerpc/mm/hugetlbpage.c5
-rw-r--r--arch/powerpc/mm/pgtable_64.c73
-rw-r--r--arch/s390/include/asm/hugetlb.h4
-rw-r--r--arch/s390/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/s390/include/asm/page.h8
-rw-r--r--arch/s390/include/asm/pgtable.h30
-rw-r--r--arch/s390/kernel/compat_wrapper.c2
-rw-r--r--arch/s390/kernel/crash_dump.c5
-rw-r--r--arch/s390/kernel/setup.c2
-rw-r--r--arch/s390/mm/hugetlbpage.c70
-rw-r--r--arch/s390/mm/pgtable.c2
-rw-r--r--arch/score/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/sh/include/asm/hugetlb.h12
-rw-r--r--arch/sh/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/sh/mm/hugetlbpage.c5
-rw-r--r--arch/sparc/include/asm/hugetlb.h13
-rw-r--r--arch/sparc/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/sparc/include/asm/pgtable_64.h17
-rw-r--r--arch/sparc/kernel/ldc.c8
-rw-r--r--arch/sparc/mm/hugetlbpage.c5
-rw-r--r--arch/sparc/mm/init_64.c6
-rw-r--r--arch/tile/include/asm/hugetlb.h13
-rw-r--r--arch/tile/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/tile/include/asm/pgtable.h8
-rw-r--r--arch/tile/mm/hugetlbpage.c5
-rw-r--r--arch/um/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/unicore32/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/hugetlb.h12
-rw-r--r--arch/x86/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/kernel/check.c3
-rw-r--r--arch/x86/kernel/e820.c3
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c11
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/setup.c3
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/platform/efi/efi.c21
-rw-r--r--arch/x86/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/syscalls/syscall_64.tbl1
-rw-r--r--arch/xtensa/include/asm/dma-mapping.h19
-rw-r--r--arch/xtensa/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/xtensa/include/uapi/asm/mman.h1
-rw-r--r--block/genhd.c2
-rw-r--r--drivers/base/node.c6
-rw-r--r--drivers/block/zram/Kconfig10
-rw-r--r--drivers/block/zram/zcomp.c7
-rw-r--r--drivers/block/zram/zcomp.h1
-rw-r--r--drivers/block/zram/zram_drv.c957
-rw-r--r--drivers/block/zram/zram_drv.h10
-rw-r--r--drivers/memstick/host/jmb38x_ms.c12
-rw-r--r--drivers/memstick/host/r592.c10
-rw-r--r--drivers/misc/kgdbts.c2
-rw-r--r--drivers/misc/spear13xx_pcie_gadget.c2
-rw-r--r--drivers/net/netconsole.c169
-rw-r--r--drivers/pci/host/pcie-spear13xx.c6
-rw-r--r--drivers/phy/phy-spear1310-miphy.c6
-rw-r--r--drivers/phy/phy-spear1340-miphy.c6
-rw-r--r--drivers/rtc/class.c24
-rw-r--r--drivers/rtc/interface.c13
-rw-r--r--drivers/rtc/rtc-ds1307.c12
-rw-r--r--drivers/rtc/rtc-omap.c14
-rw-r--r--drivers/scsi/scsi_debug.c12
-rw-r--r--drivers/staging/android/lowmemorykiller.c2
-rw-r--r--drivers/tty/sysrq.c5
-rw-r--r--drivers/usb/misc/lvstest.c2
-rw-r--r--drivers/w1/masters/omap_hdq.c224
-rw-r--r--drivers/xen/tmem.c8
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/amigaffs.c2
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/affs/symlink.c4
-rw-r--r--fs/befs/btree.c6
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/configfs/item.c3
-rw-r--r--fs/coredump.c19
-rw-r--r--fs/devpts/inode.c31
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/fat/cache.c79
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/fat.h6
-rw-r--r--fs/fat/file.c61
-rw-r--r--fs/fat/inode.c75
-rw-r--r--fs/hugetlbfs/inode.c1
-rw-r--r--fs/jbd2/journal.c11
-rw-r--r--fs/jbd2/transaction.c20
-rw-r--r--fs/minix/inode.c2
-rw-r--r--fs/mount.h3
-rw-r--r--fs/mpage.c23
-rw-r--r--fs/namespace.c6
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ntfs/malloc.h7
-rw-r--r--fs/ocfs2/acl.c26
-rw-r--r--fs/ocfs2/alloc.c282
-rw-r--r--fs/ocfs2/aops.c61
-rw-r--r--fs/ocfs2/aops.h7
-rw-r--r--fs/ocfs2/buffer_head_io.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c49
-rw-r--r--fs/ocfs2/cluster/masklog.c34
-rw-r--r--fs/ocfs2/cluster/masklog.h42
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dir.c90
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c10
-rw-r--r--fs/ocfs2/dlmglue.c10
-rw-r--r--fs/ocfs2/extent_map.c22
-rw-r--r--fs/ocfs2/file.c40
-rw-r--r--fs/ocfs2/inode.c49
-rw-r--r--fs/ocfs2/journal.c68
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/ocfs2/move_extents.c8
-rw-r--r--fs/ocfs2/namei.c46
-rw-r--r--fs/ocfs2/namei.h4
-rw-r--r--fs/ocfs2/ocfs2.h12
-rw-r--r--fs/ocfs2/quota_local.c3
-rw-r--r--fs/ocfs2/refcounttree.c81
-rw-r--r--fs/ocfs2/suballoc.c90
-rw-r--r--fs/ocfs2/super.c67
-rw-r--r--fs/ocfs2/super.h8
-rw-r--r--fs/ocfs2/xattr.c51
-rw-r--r--fs/posix_acl.c46
-rw-r--r--fs/proc/Kconfig4
-rw-r--r--fs/proc/array.c12
-rw-r--r--fs/proc/base.c207
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--fs/proc_namespace.c34
-rw-r--r--fs/reiserfs/super.c3
-rw-r--r--fs/seq_file.c19
-rw-r--r--fs/splice.c2
-rw-r--r--fs/ufs/balloc.c34
-rw-r--r--fs/ufs/ialloc.c16
-rw-r--r--fs/ufs/inode.c5
-rw-r--r--fs/ufs/namei.c14
-rw-r--r--fs/ufs/super.c9
-rw-r--r--fs/ufs/ufs.h1
-rw-r--r--fs/userfaultfd.c1277
-rw-r--r--include/asm-generic/pgtable.h34
-rw-r--r--include/linux/bootmem.h8
-rw-r--r--include/linux/compiler-gcc.h207
-rw-r--r--include/linux/compiler-gcc3.h23
-rw-r--r--include/linux/compiler-gcc4.h91
-rw-r--r--include/linux/compiler-gcc5.h67
-rw-r--r--include/linux/compiler-intel.h2
-rw-r--r--include/linux/configfs.h1
-rw-r--r--include/linux/console.h1
-rw-r--r--include/linux/crc64_ecma.h56
-rw-r--r--include/linux/efi.h3
-rw-r--r--include/linux/frontswap.h14
-rw-r--r--include/linux/fsnotify_backend.h2
-rw-r--r--include/linux/gfp.h8
-rw-r--r--include/linux/huge_mm.h4
-rw-r--r--include/linux/kernel.h3
-rw-r--r--include/linux/kexec.h2
-rw-r--r--include/linux/memblock.h67
-rw-r--r--include/linux/mm-arch-hooks.h25
-rw-r--r--include/linux/mm.h89
-rw-r--r--include/linux/mm_types.h11
-rw-r--r--include/linux/mmu_notifier.h12
-rw-r--r--include/linux/mmzone.h23
-rw-r--r--include/linux/nmi.h3
-rw-r--r--include/linux/oom.h15
-rw-r--r--include/linux/page-flags.h237
-rw-r--r--include/linux/pagemap.h25
-rw-r--r--include/linux/poison.h4
-rw-r--r--include/linux/printk.h2
-rw-r--r--include/linux/rmap.h9
-rw-r--r--include/linux/rtc.h4
-rw-r--r--include/linux/scatterlist.h7
-rw-r--r--include/linux/sched.h15
-rw-r--r--include/linux/slab.h36
-rw-r--r--include/linux/smpboot.h5
-rw-r--r--include/linux/string.h1
-rw-r--r--include/linux/swap.h1
-rw-r--r--include/linux/syscalls.h7
-rw-r--r--include/linux/syslog.h6
-rw-r--r--include/linux/userfaultfd_k.h85
-rw-r--r--include/linux/vm_event_item.h1
-rw-r--r--include/linux/wait.h5
-rw-r--r--include/linux/zpool.h5
-rw-r--r--include/ras/ras_event.h85
-rw-r--r--include/uapi/asm-generic/mman-common.h1
-rw-r--r--include/uapi/linux/Kbuild1
-rw-r--r--include/uapi/linux/userfaultfd.h169
-rw-r--r--init/Kconfig12
-rw-r--r--init/do_mounts.c9
-rw-r--r--init/main.c2
-rw-r--r--ipc/msg.c36
-rw-r--r--ipc/shm.c10
-rw-r--r--ipc/util.c5
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c51
-rw-r--r--kernel/gcov/base.c6
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/printk/printk.c235
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/smpboot.c60
-rw-r--r--kernel/sys.c158
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/watchdog.c67
-rw-r--r--lib/Kconfig7
-rw-r--r--lib/Makefile1
-rw-r--r--lib/bitmap.c32
-rw-r--r--lib/crc64_ecma.c341
-rw-r--r--lib/radix-tree.c28
-rw-r--r--lib/scatterlist.c18
-rw-r--r--lib/sort.c23
-rw-r--r--lib/test-hexdump.c6
-rw-r--r--lib/vsprintf.c20
-rw-r--r--mm/Kconfig19
-rw-r--r--mm/Makefile1
-rw-r--r--mm/bootmem.c13
-rw-r--r--mm/cma.c10
-rw-r--r--mm/compaction.c32
-rw-r--r--mm/filemap.c38
-rw-r--r--mm/frontswap.c215
-rw-r--r--mm/huge_memory.c134
-rw-r--r--mm/hugetlb.c197
-rw-r--r--mm/hwpoison-inject.c4
-rw-r--r--mm/internal.h11
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/kmemleak.c18
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/madvise.c179
-rw-r--r--mm/memblock.c157
-rw-r--r--mm/memcontrol.c68
-rw-r--r--mm/memory-failure.c353
-rw-r--r--mm/memory.c28
-rw-r--r--mm/memory_hotplug.c15
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/memtest.c3
-rw-r--r--mm/migrate.c13
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mm_init.c9
-rw-r--r--mm/mmap.c40
-rw-r--r--mm/mprotect.c14
-rw-r--r--mm/mremap.c17
-rw-r--r--mm/nobootmem.c21
-rw-r--r--mm/nommu.c112
-rw-r--r--mm/oom_kill.c230
-rw-r--r--mm/page_alloc.c628
-rw-r--r--mm/page_isolation.c7
-rw-r--r--mm/pgtable-generic.c29
-rw-r--r--mm/rmap.c55
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c14
-rw-r--r--mm/slab.h10
-rw-r--r--mm/slab_common.c121
-rw-r--r--mm/slob.c13
-rw-r--r--mm/slub.c69
-rw-r--r--mm/swap.c45
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/userfaultfd.c308
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmscan.c132
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/zbud.c23
-rw-r--r--mm/zpool.c35
-rw-r--r--mm/zsmalloc.c7
-rw-r--r--mm/zswap.c16
-rw-r--r--net/sunrpc/sched.c2
-rwxr-xr-xscripts/checkpatch.pl244
-rw-r--r--scripts/gdb/linux/dmesg.py1
-rw-r--r--scripts/gdb/linux/lists.py92
-rw-r--r--scripts/gdb/linux/symbols.py9
-rw-r--r--scripts/gdb/linux/tasks.py20
-rw-r--r--scripts/gdb/linux/utils.py4
-rw-r--r--scripts/gdb/vmlinux-gdb.py1
-rwxr-xr-xscripts/get_maintainer.pl32
346 files changed, 9309 insertions, 3570 deletions
diff --git a/.mailmap b/.mailmap
index 6287004040e7c1..bcc72d093eca49 100644
--- a/.mailmap
+++ b/.mailmap
@@ -84,6 +84,7 @@ Mayuresh Janorkar <mayur@ti.com>
Michael Buesch <m@bues.ch>
Michel Dänzer <michel@tungstengraphics.com>
Mitesh shah <mshah@teja.com>
+Mohit Kumar <mohit.kumar@st.com> <mohit.kumar.dhaka@gmail.com>
Morten Welinder <terra@gnome.org>
Morten Welinder <welinder@anemone.rentec.com>
Morten Welinder <welinder@darter.rentec.com>
@@ -95,6 +96,7 @@ Patrick Mochel <mochel@digitalimplant.org>
Peter A Jonsson <pj@ludd.ltu.se>
Peter Oruba <peter@oruba.de>
Peter Oruba <peter.oruba@amd.com>
+Pratyush Anand <pratyush.anand@gmail.com> <pratyush.anand@st.com>
Praveen BP <praveenbp@ti.com>
Rajesh Shah <rajesh.shah@intel.com>
Ralf Baechle <ralf@linux-mips.org>
diff --git a/Documentation/ABI/testing/configfs-spear-pcie-gadget b/Documentation/ABI/testing/configfs-spear-pcie-gadget
index 875988146a63cd..840c324ef34d3e 100644
--- a/Documentation/ABI/testing/configfs-spear-pcie-gadget
+++ b/Documentation/ABI/testing/configfs-spear-pcie-gadget
@@ -1,7 +1,7 @@
What: /config/pcie-gadget
Date: Feb 2011
KernelVersion: 2.6.37
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Interface is used to configure selected dual mode PCIe controller
diff --git a/Documentation/ABI/testing/dev-kmsg b/Documentation/ABI/testing/dev-kmsg
index bb820be4817900..fff817efa508f5 100644
--- a/Documentation/ABI/testing/dev-kmsg
+++ b/Documentation/ABI/testing/dev-kmsg
@@ -98,4 +98,13 @@ Description: The /dev/kmsg character device node provides userspace access
logic is used internally when messages are printed to the
console, /proc/kmsg or the syslog() syscall.
+ By default, kernel tries to avoid fragments by concatenating
+ when it can and fragments are rare; however, when extended
+ console support is enabled, the in-kernel concatenation is
+ disabled and /dev/kmsg output will contain more fragments. If
+ the log consumer performs concatenation, the end result
+ should be the same. In the future, the in-kernel concatenation
+ may be removed entirely and /dev/kmsg users are recommended to
+ implement fragment handling.
+
Users: dmesg(1), userspace kernel log consumers
diff --git a/Documentation/ABI/testing/sysfs-bus-usb-lvstest b/Documentation/ABI/testing/sysfs-bus-usb-lvstest
index aae68fc2d842a5..5151290cf8e75f 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb-lvstest
+++ b/Documentation/ABI/testing/sysfs-bus-usb-lvstest
@@ -4,14 +4,14 @@ driver is bound with root hub device.
What: /sys/bus/usb/devices/.../get_dev_desc
Date: March 2014
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Write to this node to issue "Get Device Descriptor"
for Link Layer Validation device. It is needed for TD.7.06.
What: /sys/bus/usb/devices/.../u1_timeout
Date: March 2014
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Set "U1 timeout" for the downstream port where Link Layer
Validation device is connected. Timeout value must be between 0
@@ -19,7 +19,7 @@ Description:
What: /sys/bus/usb/devices/.../u2_timeout
Date: March 2014
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Set "U2 timeout" for the downstream port where Link Layer
Validation device is connected. Timeout value must be between 0
@@ -27,21 +27,21 @@ Description:
What: /sys/bus/usb/devices/.../hot_reset
Date: March 2014
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Write to this node to issue "Reset" for Link Layer Validation
device. It is needed for TD.7.29, TD.7.31, TD.7.34 and TD.7.35.
What: /sys/bus/usb/devices/.../u3_entry
Date: March 2014
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Write to this node to issue "U3 entry" for Link Layer
Validation device. It is needed for TD.7.35 and TD.7.36.
What: /sys/bus/usb/devices/.../u3_exit
Date: March 2014
-Contact: Pratyush Anand <pratyush.anand@st.com>
+Contact: Pratyush Anand <pratyush.anand@gmail.com>
Description:
Write to this node to issue "U3 exit" for Link Layer
Validation device. It is needed for TD.7.36.
diff --git a/Documentation/ABI/testing/sysfs-class-zram b/Documentation/ABI/testing/sysfs-class-zram
new file mode 100644
index 00000000000000..48ddacbe0e69e5
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-zram
@@ -0,0 +1,24 @@
+What: /sys/class/zram-control/
+Date: August 2015
+KernelVersion: 4.2
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ The zram-control/ class sub-directory belongs to zram
+ device class
+
+What: /sys/class/zram-control/hot_add
+Date: August 2015
+KernelVersion: 4.2
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ RO attribute. Read operation will cause zram to add a new
+ device and return its device id back to user (so one can
+ use /dev/zram<id>), or error code.
+
+What: /sys/class/zram-control/hot_remove
+Date: August 2015
+KernelVersion: 4.2
+Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
+Description:
+ WO attribute. Remove a specific /dev/zramX device, where X
+ is a device_id provided by user.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 48a183e29988d9..c4de576093affe 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -19,7 +19,9 @@ Following shows a typical sequence of steps for using zram.
1) Load Module:
modprobe zram num_devices=4
This creates 4 devices: /dev/zram{0,1,2,3}
- (num_devices parameter is optional. Default: 1)
+
+num_devices parameter is optional and tells zram how many devices should be
+pre-created. Default: 1.
2) Set max number of compression streams
Compression backend may use up to max_comp_streams compression streams,
@@ -97,7 +99,24 @@ size of the disk when not in use so a huge zram is wasteful.
mkfs.ext4 /dev/zram1
mount /dev/zram1 /tmp
-7) Stats:
+7) Add/remove zram devices
+
+zram provides a control interface, which enables dynamic (on-demand) device
+addition and removal.
+
+In order to add a new /dev/zramX device, perform read operation on hot_add
+attribute. This will return either new device's device id (meaning that you
+can use /dev/zram<id>) or error code.
+
+Example:
+ cat /sys/class/zram-control/hot_add
+ 1
+
+To remove the existing /dev/zramX device (where X is a device id)
+execute
+ echo X > /sys/class/zram-control/hot_remove
+
+8) Stats:
Per-device statistics are exported as various nodes under /sys/block/zram<id>/
A brief description of exported device attritbutes. For more details please
@@ -126,7 +145,7 @@ mem_used_max RW the maximum amount memory zram have consumed to
mem_limit RW the maximum amount of memory ZRAM can use to store
the compressed data
num_migrated RO the number of objects migrated migrated by compaction
-
+compact WO trigger memory compaction
WARNING
=======
@@ -172,11 +191,11 @@ line of text and contains the following stats separated by whitespace:
zero_pages
num_migrated
-8) Deactivate:
+9) Deactivate:
swapoff /dev/zram0
umount /dev/zram1
-9) Reset:
+10) Reset:
Write any positive value to 'reset' sysfs node
echo 1 > /sys/block/zram0/reset
echo 1 > /sys/block/zram1/reset
diff --git a/Documentation/devicetree/bindings/w1/omap-hdq.txt b/Documentation/devicetree/bindings/w1/omap-hdq.txt
index fef794741bd1d8..913c5f91a0f9f8 100644
--- a/Documentation/devicetree/bindings/w1/omap-hdq.txt
+++ b/Documentation/devicetree/bindings/w1/omap-hdq.txt
@@ -1,11 +1,15 @@
* OMAP HDQ One wire bus master controller
Required properties:
-- compatible : should be "ti,omap3-1w"
+- compatible : should be "ti,omap3-1w" or "ti,am4372-hdq"
- reg : Address and length of the register set for the device
- interrupts : interrupt line.
- ti,hwmods : "hdq1w"
+Optional properties:
+- ti,mode: should be "hdq": HDQ mode "1w": one-wire mode.
+ If not specified HDQ mode is implied.
+
Example:
- From omap3.dtsi
@@ -14,4 +18,5 @@ Example:
reg = <0x480b2000 0x1000>;
interrupts = <58>;
ti,hwmods = "hdq1w";
+ ti,mode = "hdq";
};
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8fc..223c32171dcc2b 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 1e166ad3e1d732..bb38ebf1fa1695 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -302,6 +302,7 @@ Code Seq#(hex) Include File Comments
0xA3 80-8F Port ACL in development:
<mailto:tlewis@mindspring.com>
0xA3 90-9F linux/dtlk.h
+0xAA 00-3F linux/uapi/linux/userfaultfd.h
0xAB 00-1F linux/nbd.h
0xAC 00-1F linux/raw.h
0xAD 00 Netfilter device in development:
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
index ab0baa692c13be..22dd6af2e4bd42 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/lockup-watchdogs.txt
@@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
administrators to configure the period of the hrtimer and the perf
event. The right value for a particular environment is a trade-off
between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores. However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument. If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up. However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl. For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/misc-devices/spear-pcie-gadget.txt b/Documentation/misc-devices/spear-pcie-gadget.txt
index 02c13ef5e908a6..89b88dee4143fd 100644
--- a/Documentation/misc-devices/spear-pcie-gadget.txt
+++ b/Documentation/misc-devices/spear-pcie-gadget.txt
@@ -2,7 +2,7 @@ Spear PCIe Gadget Driver:
Author
=============
-Pratyush Anand (pratyush.anand@st.com)
+Pratyush Anand (pratyush.anand@gmail.com)
Location
============
diff --git a/Documentation/networking/netconsole.txt b/Documentation/networking/netconsole.txt
index a5d574a9ae0987..30409a36e95daa 100644
--- a/Documentation/networking/netconsole.txt
+++ b/Documentation/networking/netconsole.txt
@@ -2,6 +2,7 @@
started by Ingo Molnar <mingo@redhat.com>, 2001.09.17
2.6 port and netpoll api by Matt Mackall <mpm@selenic.com>, Sep 9 2003
IPv6 support by Cong Wang <xiyou.wangcong@gmail.com>, Jan 1 2013
+Extended console support by Tejun Heo <tj@kernel.org>, May 1 2015
Please send bug reports to Matt Mackall <mpm@selenic.com>
Satyam Sharma <satyam.sharma@gmail.com>, and Cong Wang <xiyou.wangcong@gmail.com>
@@ -24,9 +25,10 @@ Sender and receiver configuration:
It takes a string configuration parameter "netconsole" in the
following format:
- netconsole=[src-port]@[src-ip]/[<dev>],[tgt-port]@<tgt-ip>/[tgt-macaddr]
+ netconsole=[+][src-port]@[src-ip]/[<dev>],[tgt-port]@<tgt-ip>/[tgt-macaddr]
where
+ + if present, enable extended console support
src-port source for UDP packets (defaults to 6665)
src-ip source IP to use (interface address)
dev network interface (eth0)
@@ -107,6 +109,7 @@ To remove a target:
The interface exposes these parameters of a netconsole target to userspace:
enabled Is this target currently enabled? (read-write)
+ extended Extended mode enabled (read-write)
dev_name Local network interface name (read-write)
local_port Source UDP port to use (read-write)
remote_port Remote agent's UDP port (read-write)
@@ -132,6 +135,36 @@ You can also update the local interface dynamically. This is especially
useful if you want to use interfaces that have newly come up (and may not
have existed when netconsole was loaded / initialized).
+Extended console:
+=================
+
+If '+' is prefixed to the configuration line or "extended" config file
+is set to 1, extended console support is enabled. An example boot
+param follows.
+
+ linux netconsole=+4444@10.0.0.1/eth1,9353@10.0.0.2/12:34:56:78:9a:bc
+
+Log messages are transmitted with extended metadata header in the
+following format which is the same as /dev/kmsg.
+
+ <level>,<sequnum>,<timestamp>,<contflag>;<message text>
+
+Non printable characters in <message text> are escaped using "\xff"
+notation. If the message contains optional dictionary, verbatim
+newline is used as the delimeter.
+
+If a message doesn't fit in certain number of bytes (currently 1000),
+the message is split into multiple fragments by netconsole. These
+fragments are transmitted with "ncfrag" header field added.
+
+ ncfrag=<byte-offset>/<total-bytes>
+
+For example, assuming a lot smaller chunk size, a message "the first
+chunk, the 2nd chunk." may be split as follows.
+
+ 6,416,1758426,-,ncfrag=0/31;the first chunk,
+ 6,416,1758426,-,ncfrag=16/31; the 2nd chunk.
+
Miscellaneous notes:
====================
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 2216eb187c213b..2ec6d84f391cb4 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -244,6 +244,14 @@ dentry names:
Passed by reference.
+task_struct comm name:
+
+ %pT
+
+ For printing task_struct->comm.
+
+ Passed by reference (NULL for "current").
+
struct va_format:
%pV
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index c831001c45f116..6fccb69c03e747 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -197,8 +197,8 @@ core_pattern is used to specify a core dumpfile pattern name.
%P global pid (init PID namespace)
%i tid
%I global tid (init PID namespace)
- %u uid
- %g gid
+ %u uid (in initial user namespace)
+ %g gid (in initial user namespace)
%d dump mode, matches PR_SET_DUMPABLE and
/proc/sys/fs/suid_dumpable
%s signal number
@@ -923,6 +923,27 @@ and nmi_watchdog.
==============================================================
+watchdog_cpumask:
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say:
+
+ echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+==============================================================
+
watchdog_thresh:
This value can be used to control the frequency of hrtimer and NMI
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 3be0bfc4738df5..32ee3a67dba20e 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -467,7 +467,13 @@ mmap(MAP_LOCKED) SYSTEM CALL HANDLING
In addition the mlock()/mlockall() system calls, an application can request
that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
-call. Furthermore, any mmap() call or brk() call that expands the heap by a
+call. There is one important and subtle difference here, though. mmap() + mlock()
+will fail if the range cannot be faulted in (e.g. because mm_populate fails)
+and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. The mmaped
+area will still have properties of the locked area - aka. pages will not get
+swapped out - but major page faults to fault memory in might still happen.
+
+Furthermore, any mmap() call or brk() call that expands the heap by a
task that has previously called mlockall() with the MCL_FUTURE flag will result
in the newly mapped memory being mlocked. Before the unevictable/mlock
changes, the kernel simply called make_pages_present() to allocate pages and
diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
new file mode 100644
index 00000000000000..70a3c94d19413b
--- /dev/null
+++ b/Documentation/vm/userfaultfd.txt
@@ -0,0 +1,144 @@
+= Userfaultfd =
+
+== Objective ==
+
+Userfaults allow the implementation of on-demand paging from userland
+and more generally they allow userland to take control of various
+memory page faults, something otherwise only the kernel code could do.
+
+For example userfaults allows a proper and more optimal implementation
+of the PROT_NONE+SIGSEGV trick.
+
+== Design ==
+
+Userfaults are delivered and resolved through the userfaultfd syscall.
+
+The userfaultfd (aside from registering and unregistering virtual
+memory ranges) provides two primary functionalities:
+
+1) read/POLLIN protocol to notify a userland thread of the faults
+ happening
+
+2) various UFFDIO_* ioctls that can manage the virtual memory regions
+ registered in the userfaultfd that allows userland to efficiently
+ resolve the userfaults it receives via 1) or to manage the virtual
+ memory in the background
+
+The real advantage of userfaults if compared to regular virtual memory
+management of mremap/mprotect is that the userfaults in all their
+operations never involve heavyweight structures like vmas (in fact the
+userfaultfd runtime load never takes the mmap_sem for writing).
+
+Vmas are not suitable for page- (or hugepage) granular fault tracking
+when dealing with virtual address spaces that could span
+Terabytes. Too many vmas would be needed for that.
+
+The userfaultfd once opened by invoking the syscall, can also be
+passed using unix domain sockets to a manager process, so the same
+manager process could handle the userfaults of a multitude of
+different processes without them being aware about what is going on
+(well of course unless they later try to use the userfaultfd
+themselves on the same region the manager is already tracking, which
+is a corner case that would currently return -EBUSY).
+
+== API ==
+
+When first opened the userfaultfd must be enabled invoking the
+UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
+a later API version) which will specify the read/POLLIN protocol
+userland intends to speak on the UFFD and the uffdio_api.features
+userland requires. The UFFDIO_API ioctl if successful (i.e. if the
+requested uffdio_api.api is spoken also by the running kernel and the
+requested features are going to be enabled) will return into
+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
+respectively all the available features of the read(2) protocol and
+the generic ioctl available.
+
+Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
+register a memory range in the userfaultfd by setting the
+uffdio_register structure accordingly. The uffdio_register.mode
+bitmask will specify to the kernel which kind of faults to track for
+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
+pages). The UFFDIO_REGISTER ioctl will return the
+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
+userfaults on the range registered. Not all ioctls will necessarily be
+supported for all memory types depending on the underlying virtual
+memory backend (anonymous memory vs tmpfs vs real filebacked
+mappings).
+
+Userland can use the uffdio_register.ioctls to manage the virtual
+address space in the background (to add or potentially also remove
+memory from the userfaultfd registered range). This means a userfault
+could be triggering just before userland maps in the background the
+user-faulted page.
+
+The primary ioctl to resolve userfaults is UFFDIO_COPY. That
+atomically copies a page into the userfault registered range and wakes
+up the blocked userfaults (unless uffdio_copy.mode &
+UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to
+UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
+half copied page since it'll keep userfaulting until the copy has
+finished.
+
+== QEMU/KVM ==
+
+QEMU/KVM is using the userfaultfd syscall to implement postcopy live
+migration. Postcopy live migration is one form of memory
+externalization consisting of a virtual machine running with part or
+all of its memory residing on a different node in the cloud. The
+userfaultfd abstraction is generic enough that not a single line of
+KVM kernel code had to be modified in order to add postcopy live
+migration to QEMU.
+
+Guest async page faults, FOLL_NOWAIT and all other GUP features work
+just fine in combination with userfaults. Userfaults trigger async
+page faults in the guest scheduler so those guest processes that
+aren't waiting for userfaults (i.e. network bound) can keep running in
+the guest vcpus.
+
+It is generally beneficial to run one pass of precopy live migration
+just before starting postcopy live migration, in order to avoid
+generating userfaults for readonly guest regions.
+
+The implementation of postcopy live migration currently uses one
+single bidirectional socket but in the future two different sockets
+will be used (to reduce the latency of the userfaults to the minimum
+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
+
+The QEMU in the source node writes all pages that it knows are missing
+in the destination node, into the socket, and the migration thread of
+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
+ioctls on the userfaultfd in order to map the received pages into the
+guest (UFFDIO_ZEROCOPY is used if the source page was a zero page).
+
+A different postcopy thread in the destination node listens with
+poll() to the userfaultfd in parallel. When a POLLIN event is
+generated after a userfault triggers, the postcopy thread read() from
+the userfaultfd and receives the fault address (or -EAGAIN in case the
+userfault was already resolved and waken by a UFFDIO_COPY|ZEROPAGE run
+by the parallel QEMU migration thread).
+
+After the QEMU postcopy thread (running in the destination node) gets
+the userfault address it writes the information about the missing page
+into the socket. The QEMU source node receives the information and
+roughly "seeks" to that page address and continues sending all
+remaining missing pages from that new page offset. Soon after that
+(just the time to flush the tcp_wmem queue through the network) the
+migration thread in the QEMU running in the destination node will
+receive the page that triggered the userfault and it'll map it as
+usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
+was spontaneously sent by the source or if it was an urgent page
+requested through an userfault).
+
+By the time the userfaults start, the QEMU in the destination node
+doesn't need to keep any per-page state bitmap relative to the live
+migration around and a single per-page bitmap has to be maintained in
+the QEMU running in the source node to know which pages are still
+missing in the destination node. The bitmap in the source node is
+checked to find which missing pages to send in round robin and we seek
+over it when receiving incoming userfaults. After sending each page of
+course the bitmap is updated accordingly. It's also useful to avoid
+sending the same page twice (in case the userfault is read by the
+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
+thread).
diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
index 00c3d31e7971bb..8458c0861e4e6e 100644
--- a/Documentation/vm/zswap.txt
+++ b/Documentation/vm/zswap.txt
@@ -26,8 +26,22 @@ Zswap evicts pages from compressed cache on an LRU basis to the backing swap
device when the compressed pool reaches its size limit. This requirement had
been identified in prior community discussions.
-To enabled zswap, the "enabled" attribute must be set to 1 at boot time. e.g.
-zswap.enabled=1
+Zswap is disabled by default but can be enabled at boot time by setting
+the "enabled" attribute to 1 at boot time. ie: zswap.enabled=1. Zswap
+can also be enabled and disabled at runtime using the sysfs interface.
+An example command to enable zswap at runtime, assuming sysfs is mounted
+at /sys, is:
+
+echo 1 > /sys/modules/zswap/parameters/enabled
+
+When zswap is disabled at runtime it will stop storing pages that are
+being swapped out. However, it will _not_ immediately write out or fault
+back into memory all of the pages stored in the compressed pool. The
+pages stored in zswap will remain in the compressed pool until they are
+either invalidated or faulted back into memory. In order to force all
+pages out of the compressed pool, a swapoff on the swap device(s) will
+fault back into memory all swapped out pages, including those in the
+compressed pool.
Design:
diff --git a/Documentation/w1/masters/omap-hdq b/Documentation/w1/masters/omap-hdq
index 884dc284b2155d..234522709a5f4b 100644
--- a/Documentation/w1/masters/omap-hdq
+++ b/Documentation/w1/masters/omap-hdq
@@ -44,3 +44,9 @@ e.g:
insmod omap_hdq.ko W1_ID=2
inamod w1_bq27000.ko F_ID=2
+The driver also supports 1-wire mode. In this mode, there is no need to
+pass slave ID as parameter. The driver will auto-detect slaves connected
+to the bus using SEARCH_ROM procedure. 1-wire mode can be selected by
+setting "ti,mode" property to "1w" in DT (see
+Documentation/devicetree/bindings/w1/omap-hdq.txt for more details).
+By default driver is in HDQ mode.
diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997e5..bec6666a3cc466 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -499,6 +499,13 @@ config ARCH_HAS_ELF_RANDOMIZE
- arch_mmap_rnd()
- arch_randomize_brk()
+config HAVE_COPY_THREAD_TLS
+ bool
+ help
+ Architecture provides copy_thread_tls to accept tls argument via
+ normal C parameter passing, rather than extracting the syscall
+ argument from pt_regs.
+
#
# ABI hall of shame
#
diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..b07fd862fec359
--- /dev/null
+++ b/arch/alpha/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H
+#define _ASM_ALPHA_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index 0086b472bc2b4c..836fbd44f65bae 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -44,6 +44,7 @@
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_SPACEAVAIL 5 /* ensure resources are available */
#define MADV_DONTNEED 6 /* don't need these pages */
+#define MADV_FREE 7 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/arc/include/asm/dma-mapping.h b/arch/arc/include/asm/dma-mapping.h
index fd6cdb56d4fd7e..2d28ba939d8edc 100644
--- a/arch/arc/include/asm/dma-mapping.h
+++ b/arch/arc/include/asm/dma-mapping.h
@@ -157,22 +157,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
}
static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
enum dma_data_direction dir)
{
int i;
+ struct scatterlist *sg;
- for (i = 0; i < nelems; i++, sg++)
+ for_each_sg(sglist, sg, nelems, i)
_dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir);
}
static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
- enum dma_data_direction dir)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir)
{
int i;
+ struct scatterlist *sg;
- for (i = 0; i < nelems; i++, sg++)
+ for_each_sg(sglist, sg, nelems, i)
_dma_cache_sync((unsigned int)sg_virt(sg), sg->length, dir);
}
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..c37541c5f8ba4a
--- /dev/null
+++ b/arch/arc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARC_MM_ARCH_HOOKS_H
+#define _ASM_ARC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 1f1b1cd112f3ad..7d26f6c4f0f5eb 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -53,10 +53,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline int huge_pte_none(pte_t pte)
{
return pte_none(pte);
@@ -67,15 +63,6 @@ static inline pte_t huge_pte_wrprotect(pte_t pte)
return pte_wrprotect(pte);
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..7056660c7cc472
--- /dev/null
+++ b/arch/arm/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARM_MM_ARCH_HOOKS_H
+#define _ASM_ARM_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853c3..6d6012a320b207 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -249,6 +249,7 @@ PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index c724124150934f..fcafb521f14edd 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -41,11 +41,6 @@ int pud_huge(pud_t pud)
return 0;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
int pmd_huge(pmd_t pmd)
{
return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 5b7ca8ace95f46..2fd9b14ca2956e 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -86,10 +86,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline int huge_pte_none(pte_t pte)
{
return pte_none(pte);
@@ -100,15 +96,6 @@ static inline pte_t huge_pte_wrprotect(pte_t pte)
return pte_wrprotect(pte);
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..562b655f5ba96f
--- /dev/null
+++ b/arch/arm64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H
+#define _ASM_ARM64_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 56283f8a675c5f..bd5db28324badf 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -285,10 +285,12 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mksplitting(pmd) pte_pmd(pte_mkspecial(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 2de9d2e59d9680..cccc4af87a0372 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -31,13 +31,6 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-#endif
-
int pmd_huge(pmd_t pmd)
{
return !(pmd_val(pmd) & PMD_TABLE_BIT);
diff --git a/arch/avr32/include/asm/dma-mapping.h b/arch/avr32/include/asm/dma-mapping.h
index b3d18f9f3e8d46..ae7ac9205d20cb 100644
--- a/arch/avr32/include/asm/dma-mapping.h
+++ b/arch/avr32/include/asm/dma-mapping.h
@@ -209,17 +209,18 @@ dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
* the same here.
*/
static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
- for (i = 0; i < nents; i++) {
+ for_each_sg(sglist, sg, nents, i) {
char *virt;
- sg[i].dma_address = page_to_bus(sg_page(&sg[i])) + sg[i].offset;
- virt = sg_virt(&sg[i]);
- dma_cache_sync(dev, virt, sg[i].length, direction);
+ sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
+ virt = sg_virt(sg);
+ dma_cache_sync(dev, virt, sg->length, direction);
}
return nents;
@@ -321,14 +322,14 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
}
static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
int nents, enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
- for (i = 0; i < nents; i++) {
- dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, direction);
- }
+ for_each_sg(sglist, sg, nents, i)
+ dma_cache_sync(dev, sg_virt(sg), sg->length, direction);
}
/* Now for the API extensions over the pci_ one */
diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..145452ffbdade7
--- /dev/null
+++ b/arch/avr32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H
+#define _ASM_AVR32_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */
diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..1c5211ec338fdd
--- /dev/null
+++ b/arch/blackfin/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H
+#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */
diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..bb3c4a6ce8e912
--- /dev/null
+++ b/arch/c6x/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_C6X_MM_ARCH_HOOKS_H
+#define _ASM_C6X_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */
diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..314f774db2b0da
--- /dev/null
+++ b/arch/cris/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H
+#define _ASM_CRIS_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */
diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..51d13a8704040d
--- /dev/null
+++ b/arch/frv/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_FRV_MM_ARCH_HOOKS_H
+#define _ASM_FRV_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */
diff --git a/arch/frv/include/asm/sections.h b/arch/frv/include/asm/sections.h
index 17d0fb171bba7f..d03fb64e93e9ce 100644
--- a/arch/frv/include/asm/sections.h
+++ b/arch/frv/include/asm/sections.h
@@ -35,12 +35,6 @@ extern unsigned long __nongprelbss memory_start;
extern unsigned long __nongprelbss memory_end;
extern unsigned long __nongprelbss rom_length;
-/* determine if we're running from ROM */
-static inline int is_in_rom(unsigned long addr)
-{
- return 0; /* default case: not in ROM */
-}
-
#endif
#endif
#endif /* _ASM_SECTIONS_H */
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index b99c2a7cc7a41f..8eeea0d77aadac 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -119,14 +119,16 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
EXPORT_SYMBOL(dma_map_single);
-int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
- for (i=0; i<nents; i++)
- frv_cache_wback_inv(sg_dma_address(&sg[i]),
- sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]));
+ for_each_sg(sglist, sg, nents, i) {
+ frv_cache_wback_inv(sg_dma_address(sg),
+ sg_dma_address(sg) + sg_dma_len(sg));
+ }
BUG_ON(direction == DMA_NONE);
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index 82478979ac9a8c..4d1f01dc46e5bb 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -50,19 +50,20 @@ dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
EXPORT_SYMBOL(dma_map_single);
-int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
enum dma_data_direction direction)
{
unsigned long dampr2;
void *vaddr;
int i;
+ struct scatterlist *sg;
BUG_ON(direction == DMA_NONE);
dampr2 = __get_DAMPR(2);
- for (i = 0; i < nents; i++) {
- vaddr = kmap_atomic_primary(sg_page(&sg[i]));
+ for_each_sg(sglist, sg, nents, i) {
+ vaddr = kmap_atomic_primary(sg_page(sg));
frv_dcache_writeback((unsigned long) vaddr,
(unsigned long) vaddr + PAGE_SIZE);
diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..05e8b939e416c6
--- /dev/null
+++ b/arch/hexagon/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H
+#define _ASM_HEXAGON_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index aa910054b8e732..ef65f026b11e6f 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -20,10 +20,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE);
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -69,15 +65,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..ab4b5c69832239
--- /dev/null
+++ b/arch/ia64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_IA64_MM_ARCH_HOOKS_H
+#define _ASM_IA64_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 52b7604b5215f8..f50d4b3f501ad6 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -65,11 +65,6 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
return pte;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
#define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
/*
diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
index ea21d4cad540eb..aa19b7ac8222a2 100644
--- a/arch/ia64/mm/numa.c
+++ b/arch/ia64/mm/numa.c
@@ -58,27 +58,22 @@ paddr_to_nid(unsigned long paddr)
* SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where
* the section resides.
*/
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec;
- /*
- * NOTE: The following SMP-unsafe globals are only used early in boot
- * when the kernel is running single-threaded.
- */
- static int __meminitdata last_ssec, last_esec;
- static int __meminitdata last_nid;
- if (section >= last_ssec && section < last_esec)
- return last_nid;
+ if (section >= state->last_start && section < state->last_end)
+ return state->last_nid;
for (i = 0; i < num_node_memblks; i++) {
ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT;
esec = (node_memblk[i].start_paddr + node_memblk[i].size +
((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT;
if (section >= ssec && section < esec) {
- last_ssec = ssec;
- last_esec = esec;
- last_nid = node_memblk[i].nid;
+ state->last_start = ssec;
+ state->last_end = esec;
+ state->last_nid = node_memblk[i].nid;
return node_memblk[i].nid;
}
}
diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..6d60b4750f418c
--- /dev/null
+++ b/arch/m32r/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_M32R_MM_ARCH_HOOKS_H
+#define _ASM_M32R_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */
diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..7e8709bc90ae3b
--- /dev/null
+++ b/arch/m68k/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_M68K_MM_ARCH_HOOKS_H
+#define _ASM_M68K_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h
index 471f481e67f3eb..905ed422dbeb36 100644
--- a/arch/metag/include/asm/hugetlb.h
+++ b/arch/metag/include/asm/hugetlb.h
@@ -14,10 +14,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
int prepare_hugepage_range(struct file *file, unsigned long addr,
unsigned long len);
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor,
@@ -71,15 +67,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..b0072b2eb0deff
--- /dev/null
+++ b/arch/metag/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_METAG_MM_ARCH_HOOKS_H
+#define _ASM_METAG_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index 7ca80ac42ed5a9..53f0f6c470273b 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -89,11 +89,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return pte;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
int pmd_huge(pmd_t pmd)
{
return pmd_page_shift(pmd) > PAGE_SHIFT;
diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..5c4065911bdabc
--- /dev/null
+++ b/arch/microblaze/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
+#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index fe0d15d3266015..982bc068533020 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -38,10 +38,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr,
unsigned long end,
@@ -114,15 +110,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..b5609fe8e47585
--- /dev/null
+++ b/arch/mips/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H
+#define _ASM_MIPS_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 819af9d057a8ba..9d810675814291 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
}
/*
- * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a
* different prototype.
*/
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
{
pmd_t old = *pmdp;
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index cfcb876cae6bc0..106e741aa7eee8 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -67,6 +67,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 609d1241b0c47c..eeaf0245c3b16f 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -262,12 +262,13 @@ static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
plat_unmap_dma_mem(dev, dma_addr, size, direction);
}
-static int mips_dma_map_sg(struct device *dev, struct scatterlist *sg,
+static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist,
int nents, enum dma_data_direction direction, struct dma_attrs *attrs)
{
int i;
+ struct scatterlist *sg;
- for (i = 0; i < nents; i++, sg++) {
+ for_each_sg(sglist, sg, nents, i) {
if (!plat_device_is_coherent(dev))
__dma_sync(sg_page(sg), sg->offset, sg->length,
direction);
@@ -291,13 +292,14 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page,
return plat_map_dma_mem_page(dev, page) + offset;
}
-static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nhwentries, enum dma_data_direction direction,
struct dma_attrs *attrs)
{
int i;
+ struct scatterlist *sg;
- for (i = 0; i < nhwentries; i++, sg++) {
+ for_each_sg(sglist, sg, nhwentries, i) {
if (!plat_device_is_coherent(dev) &&
direction != DMA_TO_DEVICE)
__dma_sync(sg_page(sg), sg->offset, sg->length,
@@ -324,26 +326,34 @@ static void mips_dma_sync_single_for_device(struct device *dev,
}
static void mips_dma_sync_sg_for_cpu(struct device *dev,
- struct scatterlist *sg, int nelems, enum dma_data_direction direction)
+ struct scatterlist *sglist, int nelems,
+ enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
- if (cpu_needs_post_dma_flush(dev))
- for (i = 0; i < nelems; i++, sg++)
+ if (cpu_needs_post_dma_flush(dev)) {
+ for_each_sg(sglist, sg, nelems, i) {
__dma_sync(sg_page(sg), sg->offset, sg->length,
direction);
+ }
+ }
plat_post_dma_flush(dev);
}
static void mips_dma_sync_sg_for_device(struct device *dev,
- struct scatterlist *sg, int nelems, enum dma_data_direction direction)
+ struct scatterlist *sglist, int nelems,
+ enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
- if (!plat_device_is_coherent(dev))
- for (i = 0; i < nelems; i++, sg++)
+ if (!plat_device_is_coherent(dev)) {
+ for_each_sg(sglist, sg, nelems, i) {
__dma_sync(sg_page(sg), sg->offset, sg->length,
direction);
+ }
+ }
}
int mips_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 06e0f421b41b19..74aa6f62468f2e 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -51,11 +51,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return (pte_t *) pmd;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
/*
* This function checks for proper alignment of input addr and len parameters.
*/
diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..e2029a652f4c84
--- /dev/null
+++ b/arch/mn10300/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H
+#define _ASM_MN10300_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */
diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..d7290dc68558bd
--- /dev/null
+++ b/arch/nios2/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H
+#define _ASM_NIOS2_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */
diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..6d33cb555fe13f
--- /dev/null
+++ b/arch/openrisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H
+#define _ASM_OPENRISC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..654ec63b0ee954
--- /dev/null
+++ b/arch/parisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H
+#define _ASM_PARISC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h
index 294d251ca7b2ee..6cb8db76fd4ec9 100644
--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -40,6 +40,7 @@
#define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
#define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
#define MADV_VPS_INHERIT 7 /* Inherit parents page size */
+#define MADV_FREE 8 /* free pages only if memory pressure */
/* common/generic parameters */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index ff834fd67478ac..b9402c9b34545e 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -478,14 +478,16 @@ static void pa11_dma_unmap_single(struct device *dev, dma_addr_t dma_handle, siz
static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
BUG_ON(direction == DMA_NONE);
- for (i = 0; i < nents; i++, sglist++ ) {
- unsigned long vaddr = (unsigned long)sg_virt(sglist);
- sg_dma_address(sglist) = (dma_addr_t) virt_to_phys(vaddr);
- sg_dma_len(sglist) = sglist->length;
- flush_kernel_dcache_range(vaddr, sglist->length);
+ for_each_sg(sglist, sg, nents, i) {
+ unsigned long vaddr = (unsigned long)sg_virt(sg);
+
+ sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
+ sg_dma_len(sg) = sg->length;
+ flush_kernel_dcache_range(vaddr, sg->length);
}
return nents;
}
@@ -493,6 +495,7 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, int n
static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
BUG_ON(direction == DMA_NONE);
@@ -501,8 +504,8 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, in
/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
- for (i = 0; i < nents; i++, sglist++ )
- flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+ for_each_sg(sglist, sg, nents, i)
+ flush_kernel_vmap_range(sg_virt(sg), sg->length);
return;
}
@@ -523,21 +526,23 @@ static void pa11_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_h
static void pa11_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
- for (i = 0; i < nents; i++, sglist++ )
- flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+ for_each_sg(sglist, sg, nents, i)
+ flush_kernel_vmap_range(sg_virt(sg), sg->length);
}
static void pa11_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist, int nents, enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
- for (i = 0; i < nents; i++, sglist++ )
- flush_kernel_vmap_range(sg_virt(sglist), sglist->length);
+ for_each_sg(sglist, sg, nents, i)
+ flush_kernel_vmap_range(sg_virt(sg), sg->length);
}
struct hppa_dma_ops pcxl_dma_ops = {
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 1d53a65b4ec17e..7eac89b9f02e5f 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -112,11 +112,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
-
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -173,15 +168,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..f2a2da89589704
--- /dev/null
+++ b/arch/powerpc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,28 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
+#define _ASM_POWERPC_MM_ARCH_HOOKS_H
+
+static inline void arch_remap(struct mm_struct *mm,
+ unsigned long old_start, unsigned long old_end,
+ unsigned long new_start, unsigned long new_end)
+{
+ /*
+ * mremap() doesn't allow moving multiple vmas so we can limit the
+ * check to old_start == vdso_base.
+ */
+ if (old_start == mm->context.vdso_base)
+ mm->context.vdso_base = new_start;
+}
+#define arch_remap arch_remap
+
+#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eba02dccf..825cb232eab6e9 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -8,7 +8,6 @@
#include <linux/spinlock.h>
#include <asm/mmu.h>
#include <asm/cputable.h>
-#include <asm-generic/mm_hooks.h>
#include <asm/cputhreads.h>
/*
@@ -109,5 +108,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
#endif
}
+static inline void arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
+ mm->context.vdso_base = 0;
+}
+
+static inline void arch_bprm_mm_init(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+}
+
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index f951d9cf358af8..07dbe968dc0297 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -491,9 +491,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
+#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
@@ -553,13 +555,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
@@ -576,6 +574,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
extern void pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+#define pmdp_collapse_flush pmdp_collapse_flush
+
#define __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 71f2b3f02cf884..4d65499ee1c115 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -368,3 +368,4 @@ SYSCALL_SPU(memfd_create)
SYSCALL_SPU(bpf)
COMPAT_SYS(execveat)
PPC64ONLY(switch_endian)
+SYSCALL_SPU(userfaultfd)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index f4f8b667d75be5..4a055b6c2a64f0 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
#include <uapi/asm/unistd.h>
-#define __NR_syscalls 364
+#define __NR_syscalls 365
#define __NR__exit __NR_exit
#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h
index e4aa173dae6236..6ad58d4c879bf6 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -386,5 +386,6 @@
#define __NR_bpf 361
#define __NR_execveat 362
#define __NR_switch_endian 363
+#define __NR_userfaultfd 364
#endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 5bfdab9047be25..b7a68442b76741 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -557,11 +557,11 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
struct vio_dev *viodev = to_vio_dev(dev);
struct iommu_table *tbl;
struct scatterlist *sgl;
- int ret, count = 0;
+ int ret, count;
size_t alloc_size = 0;
tbl = get_iommu_table_base(dev);
- for (sgl = sglist; count < nelems; count++, sgl++)
+ for_each_sg(sglist, sgl, nelems, count)
alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
if (vio_cmo_alloc(viodev, alloc_size)) {
@@ -577,7 +577,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
return ret;
}
- for (sgl = sglist, count = 0; count < ret; count++, sgl++)
+ for_each_sg(sglist, sgl, ret, count)
alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
if (alloc_size)
vio_cmo_dealloc(viodev, alloc_size);
@@ -594,10 +594,10 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
struct iommu_table *tbl;
struct scatterlist *sgl;
size_t alloc_size = 0;
- int count = 0;
+ int count;
tbl = get_iommu_table_base(dev);
- for (sgl = sglist; count < nelems; count++, sgl++)
+ for_each_sg(sglist, sgl, nelems, count)
alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f88ea4abc2ea9f..bb0bd7025cb88f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -439,11 +439,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
}
#endif
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 6bfadf1aa5cbbf..876232d6412688 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -554,47 +554,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
return old;
}
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- if (pmd_trans_huge(*pmdp)) {
- pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
- } else {
- /*
- * khugepaged calls this for normal pmd
- */
- pmd = *pmdp;
- pmd_clear(pmdp);
- /*
- * Wait for all pending hash_page to finish. This is needed
- * in case of subpage collapse. When we collapse normal pages
- * to hugepage, we first clear the pmd, then invalidate all
- * the PTE entries. The assumption here is that any low level
- * page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
- * hash_page with local ptep pointer value. Such a hash page
- * can result in adding new HPTE entries for normal subpages.
- * That means we could be modifying the page content as we
- * copy them to a huge page. So wait for parallel hash_page
- * to finish before invalidating HPTE entries. We can do this
- * by sending an IPI to all the cpus and executing a dummy
- * function there.
- */
- kick_all_cpus_sync();
- /*
- * Now invalidate the hpte entries in the range
- * covered by pmd. This make sure we take a
- * fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
- * hence wait for collapse to complete. Without this
- * the __collapse_huge_page_copy can result in copying
- * the old content.
- */
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
- }
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with local ptep pointer value. Such a hash page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ kick_all_cpus_sync();
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by pmd. This make sure we take a
+ * fault and will find the pmd as none, which will
+ * result in a major fault which takes mmap_sem and
+ * hence wait for collapse to complete. Without this
+ * the __collapse_huge_page_copy can result in copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
return pmd;
}
@@ -817,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
return;
}
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
{
pmd_t old_pmd;
pgtable_t pgtable;
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 11eae5f55b709d..0130d0379edd17 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -35,12 +35,8 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-#define hugetlb_prefault_arch_hook(mm) do { } while (0)
#define arch_clear_hugepage_flags(page) do { } while (0)
-int arch_prepare_hugepage(struct page *page);
-void arch_release_hugepage(struct page *page);
-
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..07680b2f3c59fe
--- /dev/null
+++ b/arch/s390/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_S390_MM_ARCH_HOOKS_H
+#define _ASM_S390_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_S390_MM_ARCH_HOOKS_H */
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 53eacbd4f09bf4..0844b780c6a44b 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -17,7 +17,10 @@
#define PAGE_DEFAULT_ACC 0
#define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4)
-#define HPAGE_SHIFT 20
+#include <asm/setup.h>
+#ifndef __ASSEMBLY__
+
+extern unsigned int HPAGE_SHIFT;
#define HPAGE_SIZE (1UL << HPAGE_SHIFT)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
@@ -27,9 +30,6 @@
#define ARCH_HAS_PREPARE_HUGEPAGE
#define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH
-#include <asm/setup.h>
-#ifndef __ASSEMBLY__
-
static inline void storage_key_init_range(unsigned long start, unsigned long end)
{
#if PAGE_DEFAULT_KEY
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index ef24a212eeb727..536e857ac0f95c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
return pmd_young(pmd);
}
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pmd_t *pmdp)
{
pmd_t pmd = *pmdp;
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
return pmd;
}
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
- unsigned long address,
- pmd_t *pmdp, int full)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmdp, int full)
{
pmd_t pmd = *pmdp;
@@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
return pmd;
}
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
{
- return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#define __HAVE_ARCH_PMDP_INVALIDATE
@@ -1548,6 +1548,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
}
}
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+}
+#define pmdp_collapse_flush pmdp_collapse_flush
+
#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot))
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c
index d7fa2f0f142514..f8498dde67b1a3 100644
--- a/arch/s390/kernel/compat_wrapper.c
+++ b/arch/s390/kernel/compat_wrapper.c
@@ -202,7 +202,7 @@ COMPAT_SYSCALL_WRAP1(epoll_create1, int, flags);
COMPAT_SYSCALL_WRAP2(tkill, int, pid, int, sig);
COMPAT_SYSCALL_WRAP3(tgkill, int, tgid, int, pid, int, sig);
COMPAT_SYSCALL_WRAP5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags);
-COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val);
+COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls);
COMPAT_SYSCALL_WRAP2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags);
COMPAT_SYSCALL_WRAP4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim);
COMPAT_SYSCALL_WRAP5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag);
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d9f0dcfcae5eae..7a75ad4594e3e7 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = {
};
#define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid) \
- for (i = 0, __next_mem_range(&i, nid, &memblock.physmem, \
+ for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE, \
+ &memblock.physmem, \
&oldmem_type, p_start, \
p_end, p_nid); \
i != (u64)ULLONG_MAX; \
- __next_mem_range(&i, nid, &memblock.physmem, \
+ __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
&oldmem_type, \
p_start, p_end, p_nid))
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index bdde603ee0ac6b..c057783b350a74 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -875,6 +875,8 @@ void __init setup_arch(char **cmdline_p)
*/
setup_hwcaps();
+ HPAGE_SHIFT = MACHINE_HAS_HPAGE ? 20 : 0;
+
/*
* Create kernel page tables and switch to virtual addressing.
*/
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index e617e74b7be22a..fb4bf2c4379e47 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -86,31 +86,16 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- pmd_t pmd;
+ pmd_t pmd = __pte_to_pmd(pte);
- pmd = __pte_to_pmd(pte);
- if (!MACHINE_HAS_HPAGE) {
- /* Emulated huge ptes loose the dirty and young bit */
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
- pmd_val(pmd) |= pte_page(pte)[1].index;
- } else
- pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
+ pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
*(pmd_t *) ptep = pmd;
}
pte_t huge_ptep_get(pte_t *ptep)
{
- unsigned long origin;
- pmd_t pmd;
+ pmd_t pmd = *(pmd_t *) ptep;
- pmd = *(pmd_t *) ptep;
- if (!MACHINE_HAS_HPAGE && pmd_present(pmd)) {
- origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN;
- pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
- pmd_val(pmd) |= *(unsigned long *) origin;
- /* Emulated huge ptes are young and dirty by definition */
- pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG | _SEGMENT_ENTRY_DIRTY;
- }
return __pmd_to_pte(pmd);
}
@@ -125,45 +110,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
-int arch_prepare_hugepage(struct page *page)
-{
- unsigned long addr = page_to_phys(page);
- pte_t pte;
- pte_t *ptep;
- int i;
-
- if (MACHINE_HAS_HPAGE)
- return 0;
-
- ptep = (pte_t *) pte_alloc_one(&init_mm, addr);
- if (!ptep)
- return -ENOMEM;
-
- pte_val(pte) = addr;
- for (i = 0; i < PTRS_PER_PTE; i++) {
- set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte);
- pte_val(pte) += PAGE_SIZE;
- }
- page[1].index = (unsigned long) ptep;
- return 0;
-}
-
-void arch_release_hugepage(struct page *page)
-{
- pte_t *ptep;
-
- if (MACHINE_HAS_HPAGE)
- return;
-
- ptep = (pte_t *) page[1].index;
- if (!ptep)
- return;
- clear_table((unsigned long *) ptep, _PAGE_INVALID,
- PTRS_PER_PTE * sizeof(pte_t));
- page_table_free(&init_mm, (unsigned long *) ptep);
- page[1].index = 0;
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
@@ -193,17 +139,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return (pte_t *) pmdp;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
int pmd_huge(pmd_t pmd)
{
- if (!MACHINE_HAS_HPAGE)
- return 0;
-
- return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE);
+ return pmd_large(pmd);
}
int pud_huge(pud_t pud)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b33f66110ca940..16154720bdb607 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -31,6 +31,8 @@
#define ALLOC_ORDER 2
#define FRAG_MASK 0x03
+unsigned int HPAGE_SHIFT;
+
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..5e38689f189ab7
--- /dev/null
+++ b/arch/score/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H
+#define _ASM_SCORE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 699255d6d1c634..ef489a56fcce3e 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
-}
-
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor,
@@ -82,15 +79,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..18087298b72886
--- /dev/null
+++ b/arch/sh/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SH_MM_ARCH_HOOKS_H
+#define _ASM_SH_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SH_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 534bc978af8a58..6385f60209b65a 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -62,11 +62,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return pte;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
int pmd_huge(pmd_t pmd)
{
return 0;
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index e4cab465b81f86..139e711ff80cdd 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -11,10 +11,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline int is_hugepage_only_range(struct mm_struct *mm,
unsigned long addr,
unsigned long len) {
@@ -82,15 +78,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..b89ba44c16f1a9
--- /dev/null
+++ b/arch/sparc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H
+#define _ASM_SPARC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2a52c91d2c8acb..5833dc5ee7d72f 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -717,6 +717,15 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return __pmd(pte_val(pte));
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ pte_t pte = __pte(pmd_val(pmd));
+
+ pte = pte_mkclean(pte);
+
+ return __pmd(pte_val(pte));
+}
+
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
@@ -865,10 +874,10 @@ static inline unsigned long pud_pfn(pud_t pud)
void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
pte_t *ptep, pte_t orig, int fullmm);
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long addr,
- pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t *pmdp)
{
pmd_t pmd = *pmdp;
set_pmd_at(mm, addr, pmdp, __pmd(0UL));
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 7d3ca30fcd1506..1ae5eb1bb04513 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -2086,6 +2086,7 @@ int ldc_map_sg(struct ldc_channel *lp,
struct cookie_state state;
struct ldc_iommu *iommu;
int err;
+ struct scatterlist *s;
if (map_perm & ~LDC_MAP_ALL)
return -EINVAL;
@@ -2112,9 +2113,10 @@ int ldc_map_sg(struct ldc_channel *lp,
state.pte_idx = (base - iommu->page_table);
state.nc = 0;
- for (i = 0; i < num_sg; i++)
- fill_cookies(&state, page_to_pfn(sg_page(&sg[i])) << PAGE_SHIFT,
- sg[i].offset, sg[i].length);
+ for_each_sg(sg, s, num_sg, i) {
+ fill_cookies(&state, page_to_pfn(sg_page(s)) << PAGE_SHIFT,
+ s->offset, s->length);
+ }
return state.nc;
}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 4242eab12e1073..131eaf4ad7f598 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -172,11 +172,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return pte;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t entry)
{
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c5d08b89a96c81..4ac88b7575147f 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void)
phys_addr_t pa_start, pa_end;
u64 i;
- for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+ &pa_end, NULL)
available = available + (pa_end - pa_start);
return available;
@@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram)
if (limit_ram >= avail_ram)
return;
- for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+ &pa_end, NULL) {
phys_addr_t region_size = pa_end - pa_start;
phys_addr_t clip_start = pa_start;
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/hugetlb.h
index 3257733003f8dc..2fac5be4de26a6 100644
--- a/arch/tile/include/asm/hugetlb.h
+++ b/arch/tile/include/asm/hugetlb.h
@@ -40,10 +40,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
-{
-}
-
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor,
@@ -98,15 +94,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..d1709ea774f786
--- /dev/null
+++ b/arch/tile/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_TILE_MM_ARCH_HOOKS_H
+#define _ASM_TILE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 95a4f19d16c526..2b05ccbebed9b8 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
}
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long address,
- pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmdp)
{
return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
}
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index 8416240c322c92..c034dc3fe2d42c 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -160,11 +160,6 @@ int pud_huge(pud_t pud)
return !!(pud_val(pud) & _PAGE_HUGE_PAGE);
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
- return 0;
-}
-
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..a7c8b0dfdd4e97
--- /dev/null
+++ b/arch/um/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UM_MM_ARCH_HOOKS_H
+#define _ASM_UM_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_UM_MM_ARCH_HOOKS_H */
diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..4d79a850c509b3
--- /dev/null
+++ b/arch/unicore32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H
+#define _ASM_UNICORE32_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc1fd8a41540b9..0c028438bc06e2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -33,6 +33,7 @@ config X86
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_USE_BUILTIN_BSWAP
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 68c05398bba9b4..f8a29d2c97b012 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file,
return 0;
}
-static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
-}
-
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor,
@@ -83,15 +80,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
}
-static inline int arch_prepare_hugepage(struct page *page)
-{
- return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
}
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..4e881a3422369c
--- /dev/null
+++ b/arch/x86/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_MM_ARCH_HOOKS_H
+#define _ASM_X86_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fe57e7a9883980..4383012950b066 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -267,6 +267,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
@@ -799,8 +804,8 @@ static inline int pmd_write(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_RW;
}
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
pmd_t pmd = native_pmdp_get_and_clear(pmdp);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6de..58118e207a69c4 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db228303..c8dda42cb6a326 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}
- for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca05f86481aace..ca83f7ac388bec 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -72,15 +72,16 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
unsigned long cmdline_len)
{
char *cmdline_ptr = ((char *)params) + cmdline_offset;
- unsigned long cmdline_ptr_phys, len;
+ unsigned long cmdline_ptr_phys, len = 0;
uint32_t cmdline_low_32, cmdline_ext_32;
- memcpy(cmdline_ptr, cmdline, cmdline_len);
if (image->type == KEXEC_TYPE_CRASH) {
- len = sprintf(cmdline_ptr + cmdline_len - 1,
- " elfcorehdr=0x%lx", image->arch.elf_load_addr);
- cmdline_len += len;
+ len = sprintf(cmdline_ptr,
+ "elfcorehdr=0x%lx ", image->arch.elf_load_addr);
}
+ memcpy(cmdline_ptr + len, cmdline, cmdline_len);
+ cmdline_len += len;
+
cmdline_ptr[cmdline_len - 1] = '\0';
pr_debug("Final command line is: %s\n", cmdline_ptr);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index e1029633f6643c..5e7047263b351a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -336,6 +336,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PHYS_BASE(phys_base);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 8e1aaf3e7f1e98..e7975e26e52b6e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1103,6 +1103,9 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
+ if (efi_enabled(EFI_BOOT))
+ efi_find_mirror();
+
/*
* The EFI specification says that boot service code won't be called
* after ExitBootServices(). This is, in fact, a lie.
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a51..8340e45c891a1d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
phys_addr_t start, end;
u64 i;
- for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
start_pfn, end_pfn);
unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 3b984c3aa1b0b5..c1c382c58c60d5 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
now->tv_nsec = 0;
}
+void __init efi_find_mirror(void)
+{
+ void *p;
+ u64 mirror_size = 0, total_size = 0;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ total_size += size;
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ memblock_mark_mirror(start, size);
+ mirror_size += size;
+ }
+ }
+ if (mirror_size)
+ pr_info("Memory: %lldM/%lldM mirrored memory\n",
+ mirror_size>>20, total_size>>20);
+}
+
/*
* Tell the kernel about the EFI memory map. This might include
* more than the max 128 entries that can fit in the e820 legacy
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index ef8187f9d28d96..dcc18ea75412d3 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -365,3 +365,4 @@
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat
+359 i386 userfaultfd sys_userfaultfd
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 9ef32d5f1b19e6..81c490634db994 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -329,6 +329,7 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 common userfaultfd sys_userfaultfd
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index ba78ccf651e776..1f5f6dc097365a 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -52,14 +52,15 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
}
static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
enum dma_data_direction direction)
{
int i;
+ struct scatterlist *sg;
BUG_ON(direction == DMA_NONE);
- for (i = 0; i < nents; i++, sg++ ) {
+ for_each_sg(sglist, sg, nents, i) {
BUG_ON(!sg_page(sg));
sg->dma_address = sg_phys(sg);
@@ -124,20 +125,24 @@ dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
consistent_sync((void *)bus_to_virt(dma_handle)+offset,size,direction);
}
static inline void
-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist, int nelems,
enum dma_data_direction dir)
{
int i;
- for (i = 0; i < nelems; i++, sg++)
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, nelems, i)
consistent_sync(sg_virt(sg), sg->length, dir);
}
static inline void
-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
- enum dma_data_direction dir)
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+ int nelems, enum dma_data_direction dir)
{
int i;
- for (i = 0; i < nelems; i++, sg++)
+ struct scatterlist *sg;
+
+ for_each_sg(sglist, sg, nelems, i)
consistent_sync(sg_virt(sg), sg->length, dir);
}
static inline int
diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h
new file mode 100644
index 00000000000000..d2e5cfd3dd02cb
--- /dev/null
+++ b/arch/xtensa/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H
+#define _ASM_XTENSA_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h
index 201aec0e0446e8..1b19f25bc567d8 100644
--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -80,6 +80,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/block/genhd.c b/block/genhd.c
index edc1dbb6c1c96f..59d18b6cb15846 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -850,7 +850,7 @@ static int show_partition(struct seq_file *seqf, void *v)
char buf[BDEVNAME_SIZE];
/* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+ if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index a2aa65b4215d39..31df474d72f4a2 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -359,12 +359,16 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#define page_initialized(page) (page->lru.next)
-static int get_nid_for_pfn(unsigned long pfn)
+static int __init_refok get_nid_for_pfn(unsigned long pfn)
{
struct page *page;
if (!pfn_valid_within(pfn))
return -1;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ if (system_state == SYSTEM_BOOTING)
+ return early_pfn_to_nid(pfn);
+#endif
page = pfn_to_page(pfn);
if (!page_initialized(page))
return -1;
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index 6489c0fd0ea6e4..386ba3d1a6ee8a 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -23,12 +23,4 @@ config ZRAM_LZ4_COMPRESS
default n
help
This option enables LZ4 compression algorithm support. Compression
- algorithm can be changed using `comp_algorithm' device attribute.
-
-config ZRAM_DEBUG
- bool "Compressed RAM block device debug support"
- depends on ZRAM
- default n
- help
- This option adds additional debugging code to the compressed
- RAM block device driver.
+ algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index f1ff39a3d1c122..965d1afb0eaa72 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -274,7 +274,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
int i = 0;
while (backends[i]) {
- if (sysfs_streq(comp, backends[i]->name))
+ if (!strcmp(comp, backends[i]->name))
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
"[%s] ", backends[i]->name);
else
@@ -286,6 +286,11 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
return sz;
}
+bool zcomp_available_algorithm(const char *comp)
+{
+ return find_backend(comp) != NULL;
+}
+
bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
{
return comp->set_max_streams(comp, num_strm);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index c59d1fca72c064..46e2b9f8f1f0e3 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -51,6 +51,7 @@ struct zcomp {
};
ssize_t zcomp_available_show(const char *comp, char *buf);
+bool zcomp_available_algorithm(const char *comp);
struct zcomp *zcomp_create(const char *comp, int max_strm);
void zcomp_destroy(struct zcomp *comp);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8dcbced0eafd5f..fb655e8d1e3b17 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -15,10 +15,6 @@
#define KMSG_COMPONENT "zram"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-#ifdef CONFIG_ZRAM_DEBUG
-#define DEBUG
-#endif
-
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bio.h>
@@ -32,12 +28,16 @@
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/sysfs.h>
#include "zram_drv.h"
-/* Globals */
+static DEFINE_IDR(zram_index_idr);
+/* idr index must be protected */
+static DEFINE_MUTEX(zram_index_mutex);
+
static int zram_major;
-static struct zram *zram_devices;
static const char *default_compressor = "lzo";
/* Module params (documentation at end) */
@@ -53,7 +53,7 @@ static inline void deprecated_attr_warn(const char *name)
}
#define ZRAM_ATTR_RO(name) \
-static ssize_t name##_show(struct device *d, \
+static ssize_t name##_show(struct device *d, \
struct device_attribute *attr, char *b) \
{ \
struct zram *zram = dev_to_zram(d); \
@@ -74,33 +74,117 @@ static inline struct zram *dev_to_zram(struct device *dev)
return (struct zram *)dev_to_disk(dev)->private_data;
}
-static ssize_t compact_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+/* flag operations require table entry bit_spin_lock() being held */
+static int zram_test_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
{
- unsigned long nr_migrated;
- struct zram *zram = dev_to_zram(dev);
- struct zram_meta *meta;
+ return meta->table[index].value & BIT(flag);
+}
- down_read(&zram->init_lock);
- if (!init_done(zram)) {
- up_read(&zram->init_lock);
- return -EINVAL;
- }
+static void zram_set_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ meta->table[index].value |= BIT(flag);
+}
- meta = zram->meta;
- nr_migrated = zs_compact(meta->mem_pool);
- atomic64_add(nr_migrated, &zram->stats.num_migrated);
- up_read(&zram->init_lock);
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ meta->table[index].value &= ~BIT(flag);
+}
- return len;
+static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+{
+ return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
}
-static ssize_t disksize_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static void zram_set_obj_size(struct zram_meta *meta,
+ u32 index, size_t size)
{
- struct zram *zram = dev_to_zram(dev);
+ unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
- return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+ meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+}
+
+static inline int is_partial_io(struct bio_vec *bvec)
+{
+ return bvec->bv_len != PAGE_SIZE;
+}
+
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline int valid_io_request(struct zram *zram,
+ sector_t start, unsigned int size)
+{
+ u64 end, bound;
+
+ /* unaligned request */
+ if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+ return 0;
+ if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+ return 0;
+
+ end = start + (size >> SECTOR_SHIFT);
+ bound = zram->disksize >> SECTOR_SHIFT;
+ /* out of range range */
+ if (unlikely(start >= bound || end > bound || start > end))
+ return 0;
+
+ /* I/O request is valid */
+ return 1;
+}
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+ if (*offset + bvec->bv_len >= PAGE_SIZE)
+ (*index)++;
+ *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static inline void update_used_max(struct zram *zram,
+ const unsigned long pages)
+{
+ unsigned long old_max, cur_max;
+
+ old_max = atomic_long_read(&zram->stats.max_used_pages);
+
+ do {
+ cur_max = old_max;
+ if (pages > cur_max)
+ old_max = atomic_long_cmpxchg(
+ &zram->stats.max_used_pages, cur_max, pages);
+ } while (old_max != cur_max);
+}
+
+static int page_zero_filled(void *ptr)
+{
+ unsigned int pos;
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+
+ for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+ if (page[pos])
+ return 0;
+ }
+
+ return 1;
+}
+
+static void handle_zero_page(struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ void *user_mem;
+
+ user_mem = kmap_atomic(page);
+ if (is_partial_io(bvec))
+ memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+ else
+ clear_page(user_mem);
+ kunmap_atomic(user_mem);
+
+ flush_dcache_page(page);
}
static ssize_t initstate_show(struct device *dev,
@@ -116,6 +200,14 @@ static ssize_t initstate_show(struct device *dev,
return scnprintf(buf, PAGE_SIZE, "%u\n", val);
}
+static ssize_t disksize_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+}
+
static ssize_t orig_data_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -143,19 +235,6 @@ static ssize_t mem_used_total_show(struct device *dev,
return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}
-static ssize_t max_comp_streams_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- int val;
- struct zram *zram = dev_to_zram(dev);
-
- down_read(&zram->init_lock);
- val = zram->max_comp_streams;
- up_read(&zram->init_lock);
-
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
-}
-
static ssize_t mem_limit_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -225,6 +304,19 @@ static ssize_t mem_used_max_store(struct device *dev,
return len;
}
+static ssize_t max_comp_streams_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->max_comp_streams;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
static ssize_t max_comp_streams_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -271,6 +363,8 @@ static ssize_t comp_algorithm_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
+ size_t sz;
+
down_write(&zram->init_lock);
if (init_done(zram)) {
up_write(&zram->init_lock);
@@ -278,69 +372,108 @@ static ssize_t comp_algorithm_store(struct device *dev,
return -EBUSY;
}
strlcpy(zram->compressor, buf, sizeof(zram->compressor));
+
+ /* ignore trailing newline */
+ sz = strlen(zram->compressor);
+ if (sz > 0 && zram->compressor[sz - 1] == '\n')
+ zram->compressor[sz - 1] = 0x00;
+
+ if (!zcomp_available_algorithm(zram->compressor))
+ len = -EINVAL;
+
up_write(&zram->init_lock);
return len;
}
-/* flag operations needs meta->tb_lock */
-static int zram_test_flag(struct zram_meta *meta, u32 index,
- enum zram_pageflags flag)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
{
- return meta->table[index].value & BIT(flag);
-}
+ unsigned long nr_migrated;
+ struct zram *zram = dev_to_zram(dev);
+ struct zram_meta *meta;
-static void zram_set_flag(struct zram_meta *meta, u32 index,
- enum zram_pageflags flag)
-{
- meta->table[index].value |= BIT(flag);
-}
+ down_read(&zram->init_lock);
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ return -EINVAL;
+ }
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
- enum zram_pageflags flag)
-{
- meta->table[index].value &= ~BIT(flag);
+ meta = zram->meta;
+ nr_migrated = zs_compact(meta->mem_pool);
+ atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ up_read(&zram->init_lock);
+
+ return len;
}
-static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+static ssize_t io_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+ struct zram *zram = dev_to_zram(dev);
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8llu\n",
+ (u64)atomic64_read(&zram->stats.failed_reads),
+ (u64)atomic64_read(&zram->stats.failed_writes),
+ (u64)atomic64_read(&zram->stats.invalid_io),
+ (u64)atomic64_read(&zram->stats.notify_free));
+ up_read(&zram->init_lock);
+
+ return ret;
}
-static void zram_set_obj_size(struct zram_meta *meta,
- u32 index, size_t size)
+static ssize_t mm_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+ struct zram *zram = dev_to_zram(dev);
+ u64 orig_size, mem_used = 0;
+ long max_used;
+ ssize_t ret;
- meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ mem_used = zs_get_total_pages(zram->meta->mem_pool);
+
+ orig_size = atomic64_read(&zram->stats.pages_stored);
+ max_used = atomic_long_read(&zram->stats.max_used_pages);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ orig_size << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.compr_data_size),
+ mem_used << PAGE_SHIFT,
+ zram->limit_pages << PAGE_SHIFT,
+ max_used << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.zero_pages),
+ (u64)atomic64_read(&zram->stats.num_migrated));
+ up_read(&zram->init_lock);
+
+ return ret;
}
-static inline int is_partial_io(struct bio_vec *bvec)
+static DEVICE_ATTR_RO(io_stat);
+static DEVICE_ATTR_RO(mm_stat);
+ZRAM_ATTR_RO(num_reads);
+ZRAM_ATTR_RO(num_writes);
+ZRAM_ATTR_RO(failed_reads);
+ZRAM_ATTR_RO(failed_writes);
+ZRAM_ATTR_RO(invalid_io);
+ZRAM_ATTR_RO(notify_free);
+ZRAM_ATTR_RO(zero_pages);
+ZRAM_ATTR_RO(compr_data_size);
+
+static inline bool zram_meta_get(struct zram *zram)
{
- return bvec->bv_len != PAGE_SIZE;
+ if (atomic_inc_not_zero(&zram->refcount))
+ return true;
+ return false;
}
-/*
- * Check if request is within bounds and aligned on zram logical blocks.
- */
-static inline int valid_io_request(struct zram *zram,
- sector_t start, unsigned int size)
+static inline void zram_meta_put(struct zram *zram)
{
- u64 end, bound;
-
- /* unaligned request */
- if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
- return 0;
- if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
- return 0;
-
- end = start + (size >> SECTOR_SHIFT);
- bound = zram->disksize >> SECTOR_SHIFT;
- /* out of range range */
- if (unlikely(start >= bound || end > bound || start > end))
- return 0;
-
- /* I/O request is valid */
- return 1;
+ atomic_dec(&zram->refcount);
}
static void zram_meta_free(struct zram_meta *meta, u64 disksize)
@@ -394,56 +527,6 @@ out_error:
return NULL;
}
-static inline bool zram_meta_get(struct zram *zram)
-{
- if (atomic_inc_not_zero(&zram->refcount))
- return true;
- return false;
-}
-
-static inline void zram_meta_put(struct zram *zram)
-{
- atomic_dec(&zram->refcount);
-}
-
-static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
-{
- if (*offset + bvec->bv_len >= PAGE_SIZE)
- (*index)++;
- *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
-}
-
-static int page_zero_filled(void *ptr)
-{
- unsigned int pos;
- unsigned long *page;
-
- page = (unsigned long *)ptr;
-
- for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
- if (page[pos])
- return 0;
- }
-
- return 1;
-}
-
-static void handle_zero_page(struct bio_vec *bvec)
-{
- struct page *page = bvec->bv_page;
- void *user_mem;
-
- user_mem = kmap_atomic(page);
- if (is_partial_io(bvec))
- memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
- else
- clear_page(user_mem);
- kunmap_atomic(user_mem);
-
- flush_dcache_page(page);
-}
-
-
/*
* To protect concurrent access to the same index entry,
* caller should hold this table index entry's bit_spinlock to
@@ -561,21 +644,6 @@ out_cleanup:
return ret;
}
-static inline void update_used_max(struct zram *zram,
- const unsigned long pages)
-{
- unsigned long old_max, cur_max;
-
- old_max = atomic_long_read(&zram->stats.max_used_pages);
-
- do {
- cur_max = old_max;
- if (pages > cur_max)
- old_max = atomic_long_cmpxchg(
- &zram->stats.max_used_pages, cur_max, pages);
- } while (old_max != cur_max);
-}
-
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset)
{
@@ -585,8 +653,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
- struct zcomp_strm *zstrm;
- bool locked = false;
+ struct zcomp_strm *zstrm = NULL;
unsigned long alloced_pages;
page = bvec->bv_page;
@@ -606,7 +673,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
}
zstrm = zcomp_strm_find(zram->comp);
- locked = true;
user_mem = kmap_atomic(page);
if (is_partial_io(bvec)) {
@@ -678,7 +744,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
}
zcomp_strm_release(zram->comp, zstrm);
- locked = false;
+ zstrm = NULL;
zs_unmap_object(meta->mem_pool, handle);
/*
@@ -696,42 +762,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
atomic64_add(clen, &zram->stats.compr_data_size);
atomic64_inc(&zram->stats.pages_stored);
out:
- if (locked)
+ if (zstrm)
zcomp_strm_release(zram->comp, zstrm);
if (is_partial_io(bvec))
kfree(uncmem);
return ret;
}
-static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
- int offset, int rw)
-{
- unsigned long start_time = jiffies;
- int ret;
-
- generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT,
- &zram->disk->part0);
-
- if (rw == READ) {
- atomic64_inc(&zram->stats.num_reads);
- ret = zram_bvec_read(zram, bvec, index, offset);
- } else {
- atomic64_inc(&zram->stats.num_writes);
- ret = zram_bvec_write(zram, bvec, index, offset);
- }
-
- generic_end_io_acct(rw, &zram->disk->part0, start_time);
-
- if (unlikely(ret)) {
- if (rw == READ)
- atomic64_inc(&zram->stats.failed_reads);
- else
- atomic64_inc(&zram->stats.failed_writes);
- }
-
- return ret;
-}
-
/*
* zram_bio_discard - handler on discard request
* @index: physical block index in PAGE_SIZE units
@@ -771,149 +808,32 @@ static void zram_bio_discard(struct zram *zram, u32 index,
}
}
-static void zram_reset_device(struct zram *zram)
-{
- struct zram_meta *meta;
- struct zcomp *comp;
- u64 disksize;
-
- down_write(&zram->init_lock);
-
- zram->limit_pages = 0;
-
- if (!init_done(zram)) {
- up_write(&zram->init_lock);
- return;
- }
-
- meta = zram->meta;
- comp = zram->comp;
- disksize = zram->disksize;
- /*
- * Refcount will go down to 0 eventually and r/w handler
- * cannot handle further I/O so it will bail out by
- * check zram_meta_get.
- */
- zram_meta_put(zram);
- /*
- * We want to free zram_meta in process context to avoid
- * deadlock between reclaim path and any other locks.
- */
- wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
-
- /* Reset stats */
- memset(&zram->stats, 0, sizeof(zram->stats));
- zram->disksize = 0;
- zram->max_comp_streams = 1;
- set_capacity(zram->disk, 0);
-
- up_write(&zram->init_lock);
- /* I/O operation under all of CPU are done so let's free */
- zram_meta_free(meta, disksize);
- zcomp_destroy(comp);
-}
-
-static ssize_t disksize_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
-{
- u64 disksize;
- struct zcomp *comp;
- struct zram_meta *meta;
- struct zram *zram = dev_to_zram(dev);
- int err;
-
- disksize = memparse(buf, NULL);
- if (!disksize)
- return -EINVAL;
-
- disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(zram->disk->first_minor, disksize);
- if (!meta)
- return -ENOMEM;
-
- comp = zcomp_create(zram->compressor, zram->max_comp_streams);
- if (IS_ERR(comp)) {
- pr_info("Cannot initialise %s compressing backend\n",
- zram->compressor);
- err = PTR_ERR(comp);
- goto out_free_meta;
- }
-
- down_write(&zram->init_lock);
- if (init_done(zram)) {
- pr_info("Cannot change disksize for initialized device\n");
- err = -EBUSY;
- goto out_destroy_comp;
- }
-
- init_waitqueue_head(&zram->io_done);
- atomic_set(&zram->refcount, 1);
- zram->meta = meta;
- zram->comp = comp;
- zram->disksize = disksize;
- set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
- up_write(&zram->init_lock);
-
- /*
- * Revalidate disk out of the init_lock to avoid lockdep splat.
- * It's okay because disk's capacity is protected by init_lock
- * so that revalidate_disk always sees up-to-date capacity.
- */
- revalidate_disk(zram->disk);
-
- return len;
-
-out_destroy_comp:
- up_write(&zram->init_lock);
- zcomp_destroy(comp);
-out_free_meta:
- zram_meta_free(meta, disksize);
- return err;
-}
-
-static ssize_t reset_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
+static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
+ int offset, int rw)
{
+ unsigned long start_time = jiffies;
int ret;
- unsigned short do_reset;
- struct zram *zram;
- struct block_device *bdev;
- zram = dev_to_zram(dev);
- bdev = bdget_disk(zram->disk, 0);
-
- if (!bdev)
- return -ENOMEM;
+ generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT,
+ &zram->disk->part0);
- mutex_lock(&bdev->bd_mutex);
- /* Do not reset an active device! */
- if (bdev->bd_openers) {
- ret = -EBUSY;
- goto out;
+ if (rw == READ) {
+ atomic64_inc(&zram->stats.num_reads);
+ ret = zram_bvec_read(zram, bvec, index, offset);
+ } else {
+ atomic64_inc(&zram->stats.num_writes);
+ ret = zram_bvec_write(zram, bvec, index, offset);
}
- ret = kstrtou16(buf, 10, &do_reset);
- if (ret)
- goto out;
+ generic_end_io_acct(rw, &zram->disk->part0, start_time);
- if (!do_reset) {
- ret = -EINVAL;
- goto out;
+ if (unlikely(ret)) {
+ if (rw == READ)
+ atomic64_inc(&zram->stats.failed_reads);
+ else
+ atomic64_inc(&zram->stats.failed_writes);
}
- /* Make sure all pending I/O is finished */
- fsync_bdev(bdev);
- zram_reset_device(zram);
-
- mutex_unlock(&bdev->bd_mutex);
- revalidate_disk(zram->disk);
- bdput(bdev);
-
- return len;
-
-out:
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdev);
return ret;
}
@@ -1053,80 +973,185 @@ out:
return err;
}
-static const struct block_device_operations zram_devops = {
- .swap_slot_free_notify = zram_slot_free_notify,
- .rw_page = zram_rw_page,
- .owner = THIS_MODULE
-};
+static void zram_reset_device(struct zram *zram)
+{
+ struct zram_meta *meta;
+ struct zcomp *comp;
+ u64 disksize;
-static DEVICE_ATTR_WO(compact);
-static DEVICE_ATTR_RW(disksize);
-static DEVICE_ATTR_RO(initstate);
-static DEVICE_ATTR_WO(reset);
-static DEVICE_ATTR_RO(orig_data_size);
-static DEVICE_ATTR_RO(mem_used_total);
-static DEVICE_ATTR_RW(mem_limit);
-static DEVICE_ATTR_RW(mem_used_max);
-static DEVICE_ATTR_RW(max_comp_streams);
-static DEVICE_ATTR_RW(comp_algorithm);
+ down_write(&zram->init_lock);
-static ssize_t io_stat_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ zram->limit_pages = 0;
+
+ if (!init_done(zram)) {
+ up_write(&zram->init_lock);
+ return;
+ }
+
+ meta = zram->meta;
+ comp = zram->comp;
+ disksize = zram->disksize;
+ /*
+ * Refcount will go down to 0 eventually and r/w handler
+ * cannot handle further I/O so it will bail out by
+ * check zram_meta_get.
+ */
+ zram_meta_put(zram);
+ /*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks.
+ */
+ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
+
+ /* Reset stats */
+ memset(&zram->stats, 0, sizeof(zram->stats));
+ zram->disksize = 0;
+ zram->max_comp_streams = 1;
+
+ set_capacity(zram->disk, 0);
+ part_stat_set_all(&zram->disk->part0, 0);
+
+ up_write(&zram->init_lock);
+ /* I/O operation under all of CPU are done so let's free */
+ zram_meta_free(meta, disksize);
+ zcomp_destroy(comp);
+}
+
+static ssize_t disksize_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
{
+ u64 disksize;
+ struct zcomp *comp;
+ struct zram_meta *meta;
struct zram *zram = dev_to_zram(dev);
- ssize_t ret;
+ int err;
- down_read(&zram->init_lock);
- ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8llu\n",
- (u64)atomic64_read(&zram->stats.failed_reads),
- (u64)atomic64_read(&zram->stats.failed_writes),
- (u64)atomic64_read(&zram->stats.invalid_io),
- (u64)atomic64_read(&zram->stats.notify_free));
- up_read(&zram->init_lock);
+ disksize = memparse(buf, NULL);
+ if (!disksize)
+ return -EINVAL;
- return ret;
+ disksize = PAGE_ALIGN(disksize);
+ meta = zram_meta_alloc(zram->disk->first_minor, disksize);
+ if (!meta)
+ return -ENOMEM;
+
+ comp = zcomp_create(zram->compressor, zram->max_comp_streams);
+ if (IS_ERR(comp)) {
+ pr_info("Cannot initialise %s compressing backend\n",
+ zram->compressor);
+ err = PTR_ERR(comp);
+ goto out_free_meta;
+ }
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ pr_info("Cannot change disksize for initialized device\n");
+ err = -EBUSY;
+ goto out_destroy_comp;
+ }
+
+ init_waitqueue_head(&zram->io_done);
+ atomic_set(&zram->refcount, 1);
+ zram->meta = meta;
+ zram->comp = comp;
+ zram->disksize = disksize;
+ set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+ up_write(&zram->init_lock);
+
+ /*
+ * Revalidate disk out of the init_lock to avoid lockdep splat.
+ * It's okay because disk's capacity is protected by init_lock
+ * so that revalidate_disk always sees up-to-date capacity.
+ */
+ revalidate_disk(zram->disk);
+
+ return len;
+
+out_destroy_comp:
+ up_write(&zram->init_lock);
+ zcomp_destroy(comp);
+out_free_meta:
+ zram_meta_free(meta, disksize);
+ return err;
}
-static ssize_t mm_stat_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t reset_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
{
- struct zram *zram = dev_to_zram(dev);
- u64 orig_size, mem_used = 0;
- long max_used;
- ssize_t ret;
+ int ret;
+ unsigned short do_reset;
+ struct zram *zram;
+ struct block_device *bdev;
- down_read(&zram->init_lock);
- if (init_done(zram))
- mem_used = zs_get_total_pages(zram->meta->mem_pool);
+ ret = kstrtou16(buf, 10, &do_reset);
+ if (ret)
+ return ret;
- orig_size = atomic64_read(&zram->stats.pages_stored);
- max_used = atomic_long_read(&zram->stats.max_used_pages);
+ if (!do_reset)
+ return -EINVAL;
- ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
- orig_size << PAGE_SHIFT,
- (u64)atomic64_read(&zram->stats.compr_data_size),
- mem_used << PAGE_SHIFT,
- zram->limit_pages << PAGE_SHIFT,
- max_used << PAGE_SHIFT,
- (u64)atomic64_read(&zram->stats.zero_pages),
- (u64)atomic64_read(&zram->stats.num_migrated));
- up_read(&zram->init_lock);
+ zram = dev_to_zram(dev);
+ bdev = bdget_disk(zram->disk, 0);
+ if (!bdev)
+ return -ENOMEM;
+
+ mutex_lock(&bdev->bd_mutex);
+ /* Do not reset an active device or claimed device */
+ if (bdev->bd_openers || zram->claim) {
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+ return -EBUSY;
+ }
+
+ /* From now on, anyone can't open /dev/zram[0-9] */
+ zram->claim = true;
+ mutex_unlock(&bdev->bd_mutex);
+
+ /* Make sure all the pending I/O are finished */
+ fsync_bdev(bdev);
+ zram_reset_device(zram);
+ revalidate_disk(zram->disk);
+ bdput(bdev);
+
+ mutex_lock(&bdev->bd_mutex);
+ zram->claim = false;
+ mutex_unlock(&bdev->bd_mutex);
+
+ return len;
+}
+
+static int zram_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret = 0;
+ struct zram *zram;
+
+ WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
+
+ zram = bdev->bd_disk->private_data;
+ /* zram was claimed to reset so open request fails */
+ if (zram->claim)
+ ret = -EBUSY;
return ret;
}
-static DEVICE_ATTR_RO(io_stat);
-static DEVICE_ATTR_RO(mm_stat);
-ZRAM_ATTR_RO(num_reads);
-ZRAM_ATTR_RO(num_writes);
-ZRAM_ATTR_RO(failed_reads);
-ZRAM_ATTR_RO(failed_writes);
-ZRAM_ATTR_RO(invalid_io);
-ZRAM_ATTR_RO(notify_free);
-ZRAM_ATTR_RO(zero_pages);
-ZRAM_ATTR_RO(compr_data_size);
+static const struct block_device_operations zram_devops = {
+ .open = zram_open,
+ .swap_slot_free_notify = zram_slot_free_notify,
+ .rw_page = zram_rw_page,
+ .owner = THIS_MODULE
+};
+
+static DEVICE_ATTR_WO(compact);
+static DEVICE_ATTR_RW(disksize);
+static DEVICE_ATTR_RO(initstate);
+static DEVICE_ATTR_WO(reset);
+static DEVICE_ATTR_RO(orig_data_size);
+static DEVICE_ATTR_RO(mem_used_total);
+static DEVICE_ATTR_RW(mem_limit);
+static DEVICE_ATTR_RW(mem_used_max);
+static DEVICE_ATTR_RW(max_comp_streams);
+static DEVICE_ATTR_RW(comp_algorithm);
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1156,10 +1181,24 @@ static struct attribute_group zram_disk_attr_group = {
.attrs = zram_disk_attrs,
};
-static int create_device(struct zram *zram, int device_id)
+/*
+ * Allocate and initialize new zram device. the function returns
+ * '>= 0' device_id upon success, and negative value otherwise.
+ */
+static int zram_add(void)
{
+ struct zram *zram;
struct request_queue *queue;
- int ret = -ENOMEM;
+ int ret, device_id;
+
+ zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
+ if (!zram)
+ return -ENOMEM;
+
+ ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto out_free_dev;
+ device_id = ret;
init_rwsem(&zram->init_lock);
@@ -1167,12 +1206,13 @@ static int create_device(struct zram *zram, int device_id)
if (!queue) {
pr_err("Error allocating disk queue for device %d\n",
device_id);
- goto out;
+ ret = -ENOMEM;
+ goto out_free_idr;
}
blk_queue_make_request(queue, zram_make_request);
- /* gendisk structure */
+ /* gendisk structure */
zram->disk = alloc_disk(1);
if (!zram->disk) {
pr_warn("Error allocating disk structure for device %d\n",
@@ -1230,90 +1270,177 @@ static int create_device(struct zram *zram, int device_id)
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
zram->meta = NULL;
zram->max_comp_streams = 1;
- return 0;
+
+ pr_info("Added device: %s\n", zram->disk->disk_name);
+ return device_id;
out_free_disk:
del_gendisk(zram->disk);
put_disk(zram->disk);
out_free_queue:
blk_cleanup_queue(queue);
-out:
+out_free_idr:
+ idr_remove(&zram_index_idr, device_id);
+out_free_dev:
+ kfree(zram);
return ret;
}
-static void destroy_devices(unsigned int nr)
+static int zram_remove(struct zram *zram)
+{
+ struct block_device *bdev;
+
+ bdev = bdget_disk(zram->disk, 0);
+ if (!bdev)
+ return -ENOMEM;
+
+ mutex_lock(&bdev->bd_mutex);
+ if (bdev->bd_openers || zram->claim) {
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+ return -EBUSY;
+ }
+
+ zram->claim = true;
+ mutex_unlock(&bdev->bd_mutex);
+
+ /*
+ * Remove sysfs first, so no one will perform a disksize
+ * store while we destroy the devices. This also helps during
+ * hot_remove -- zram_reset_device() is the last holder of
+ * ->init_lock, no later/concurrent disksize_store() or any
+ * other sysfs handlers are possible.
+ */
+ sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+
+ /* Make sure all the pending I/O are finished */
+ fsync_bdev(bdev);
+ zram_reset_device(zram);
+ bdput(bdev);
+
+ pr_info("Removed device: %s\n", zram->disk->disk_name);
+
+ idr_remove(&zram_index_idr, zram->disk->first_minor);
+ blk_cleanup_queue(zram->disk->queue);
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+ kfree(zram);
+ return 0;
+}
+
+/* zram-control sysfs attributes */
+static ssize_t hot_add_show(struct class *class,
+ struct class_attribute *attr,
+ char *buf)
+{
+ int ret;
+
+ mutex_lock(&zram_index_mutex);
+ ret = zram_add();
+ mutex_unlock(&zram_index_mutex);
+
+ if (ret < 0)
+ return ret;
+ return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+}
+
+static ssize_t hot_remove_store(struct class *class,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
{
struct zram *zram;
- unsigned int i;
+ int ret, dev_id;
- for (i = 0; i < nr; i++) {
- zram = &zram_devices[i];
- /*
- * Remove sysfs first, so no one will perform a disksize
- * store while we destroy the devices
- */
- sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
- &zram_disk_attr_group);
+ /* dev_id is gendisk->first_minor, which is `int' */
+ ret = kstrtoint(buf, 10, &dev_id);
+ if (ret)
+ return ret;
+ if (dev_id < 0)
+ return -EINVAL;
- zram_reset_device(zram);
+ mutex_lock(&zram_index_mutex);
- blk_cleanup_queue(zram->disk->queue);
- del_gendisk(zram->disk);
- put_disk(zram->disk);
- }
+ zram = idr_find(&zram_index_idr, dev_id);
+ if (zram)
+ ret = zram_remove(zram);
+ else
+ ret = -ENODEV;
+
+ mutex_unlock(&zram_index_mutex);
+ return ret ? ret : count;
+}
+
+static struct class_attribute zram_control_class_attrs[] = {
+ __ATTR_RO(hot_add),
+ __ATTR_WO(hot_remove),
+ __ATTR_NULL,
+};
+
+static struct class zram_control_class = {
+ .name = "zram-control",
+ .owner = THIS_MODULE,
+ .class_attrs = zram_control_class_attrs,
+};
+
+static int zram_remove_cb(int id, void *ptr, void *data)
+{
+ zram_remove(ptr);
+ return 0;
+}
- kfree(zram_devices);
+static void destroy_devices(void)
+{
+ class_unregister(&zram_control_class);
+ idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
+ idr_destroy(&zram_index_idr);
unregister_blkdev(zram_major, "zram");
- pr_info("Destroyed %u device(s)\n", nr);
}
static int __init zram_init(void)
{
- int ret, dev_id;
+ int ret;
- if (num_devices > max_num_devices) {
- pr_warn("Invalid value for num_devices: %u\n",
- num_devices);
- return -EINVAL;
+ ret = class_register(&zram_control_class);
+ if (ret) {
+ pr_warn("Unable to register zram-control class\n");
+ return ret;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
pr_warn("Unable to get major number\n");
+ class_unregister(&zram_control_class);
return -EBUSY;
}
- /* Allocate the device array and initialize each one */
- zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
- if (!zram_devices) {
- unregister_blkdev(zram_major, "zram");
- return -ENOMEM;
- }
-
- for (dev_id = 0; dev_id < num_devices; dev_id++) {
- ret = create_device(&zram_devices[dev_id], dev_id);
- if (ret)
+ while (num_devices != 0) {
+ mutex_lock(&zram_index_mutex);
+ ret = zram_add();
+ mutex_unlock(&zram_index_mutex);
+ if (ret < 0)
goto out_error;
+ num_devices--;
}
- pr_info("Created %u device(s)\n", num_devices);
return 0;
out_error:
- destroy_devices(dev_id);
+ destroy_devices();
return ret;
}
static void __exit zram_exit(void)
{
- destroy_devices(num_devices);
+ destroy_devices();
}
module_init(zram_init);
module_exit(zram_exit);
module_param(num_devices, uint, 0);
-MODULE_PARM_DESC(num_devices, "Number of zram devices");
+MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 570c598f4ce9af..6dbe2df506bf0b 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -20,12 +20,6 @@
#include "zcomp.h"
-/*
- * Some arbitrary value. This is just to catch
- * invalid value for num_devices module parameter.
- */
-static const unsigned max_num_devices = 32;
-
/*-- Configurable parameters */
/*
@@ -121,5 +115,9 @@ struct zram {
*/
u64 disksize; /* bytes */
char compressor[10];
+ /*
+ * zram is claimed so open request will be failed
+ */
+ bool claim; /* Protected by bdev->bd_mutex */
};
#endif
diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c
index aeabaa5aedf7d2..48db922075e2ad 100644
--- a/drivers/memstick/host/jmb38x_ms.c
+++ b/drivers/memstick/host/jmb38x_ms.c
@@ -419,10 +419,10 @@ static int jmb38x_ms_issue_cmd(struct memstick_host *msh)
}
if (host->cmd_flags & DMA_DATA) {
- if (1 != pci_map_sg(host->chip->pdev, &host->req->sg, 1,
+ if (1 != dma_map_sg(&host->chip->pdev->dev, &host->req->sg, 1,
host->req->data_dir == READ
- ? PCI_DMA_FROMDEVICE
- : PCI_DMA_TODEVICE)) {
+ ? DMA_FROM_DEVICE
+ : DMA_TO_DEVICE)) {
host->req->error = -ENOMEM;
return host->req->error;
}
@@ -487,9 +487,9 @@ static void jmb38x_ms_complete_cmd(struct memstick_host *msh, int last)
writel(0, host->addr + DMA_CONTROL);
if (host->cmd_flags & DMA_DATA) {
- pci_unmap_sg(host->chip->pdev, &host->req->sg, 1,
+ dma_unmap_sg(&host->chip->pdev->dev, &host->req->sg, 1,
host->req->data_dir == READ
- ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
+ ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
} else {
t_val = readl(host->addr + INT_STATUS_ENABLE);
if (host->req->data_dir == READ)
@@ -925,7 +925,7 @@ static int jmb38x_ms_probe(struct pci_dev *pdev,
int pci_dev_busy = 0;
int rc, cnt;
- rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
if (rc)
return rc;
diff --git a/drivers/memstick/host/r592.c b/drivers/memstick/host/r592.c
index e2a4f5f415b2be..ef09ba0289d723 100644
--- a/drivers/memstick/host/r592.c
+++ b/drivers/memstick/host/r592.c
@@ -754,7 +754,7 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto error2;
pci_set_master(pdev);
- error = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ error = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
if (error)
goto error3;
@@ -787,8 +787,8 @@ static int r592_probe(struct pci_dev *pdev, const struct pci_device_id *id)
}
/* This is just a precation, so don't fail */
- dev->dummy_dma_page = pci_alloc_consistent(pdev, PAGE_SIZE,
- &dev->dummy_dma_page_physical_address);
+ dev->dummy_dma_page = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
+ &dev->dummy_dma_page_physical_address, GFP_KERNEL);
r592_stop_dma(dev , 0);
if (request_irq(dev->irq, &r592_irq, IRQF_SHARED,
@@ -805,7 +805,7 @@ error7:
free_irq(dev->irq, dev);
error6:
if (dev->dummy_dma_page)
- pci_free_consistent(pdev, PAGE_SIZE, dev->dummy_dma_page,
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page,
dev->dummy_dma_page_physical_address);
kthread_stop(dev->io_thread);
@@ -845,7 +845,7 @@ static void r592_remove(struct pci_dev *pdev)
memstick_free_host(dev->host);
if (dev->dummy_dma_page)
- pci_free_consistent(pdev, PAGE_SIZE, dev->dummy_dma_page,
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->dummy_dma_page,
dev->dummy_dma_page_physical_address);
}
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index 36f5d52775a98a..9a60bd4d3c4948 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -220,7 +220,7 @@ static unsigned long lookup_addr(char *arg)
else if (!strcmp(arg, "sys_open"))
addr = (unsigned long)do_sys_open;
else if (!strcmp(arg, "do_fork"))
- addr = (unsigned long)do_fork;
+ addr = (unsigned long)_do_fork;
else if (!strcmp(arg, "hw_break_val"))
addr = (unsigned long)&hw_break_val;
addr = (unsigned long) dereference_function_descriptor((void *)addr);
diff --git a/drivers/misc/spear13xx_pcie_gadget.c b/drivers/misc/spear13xx_pcie_gadget.c
index fe3ad0ca9a3ee5..b8374cdaf9c940 100644
--- a/drivers/misc/spear13xx_pcie_gadget.c
+++ b/drivers/misc/spear13xx_pcie_gadget.c
@@ -2,7 +2,7 @@
* drivers/misc/spear13xx_pcie_gadget.c
*
* Copyright (C) 2010 ST Microelectronics
- * Pratyush Anand<pratyush.anand@st.com>
+ * Pratyush Anand<pratyush.anand@gmail.com>
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 15731d1db918c3..97f3acd4479858 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -79,6 +79,12 @@ static LIST_HEAD(target_list);
/* This needs to be a spinlock because write_msg() cannot sleep */
static DEFINE_SPINLOCK(target_list_lock);
+/*
+ * Console driver for extended netconsoles. Registered on the first use to
+ * avoid unnecessarily enabling ext message formatting.
+ */
+static struct console netconsole_ext;
+
/**
* struct netconsole_target - Represents a configured netconsole target.
* @list: Links this target into the target_list.
@@ -104,14 +110,15 @@ struct netconsole_target {
#ifdef CONFIG_NETCONSOLE_DYNAMIC
struct config_item item;
#endif
- int enabled;
- struct mutex mutex;
+ bool enabled;
+ bool extended;
struct netpoll np;
};
#ifdef CONFIG_NETCONSOLE_DYNAMIC
static struct configfs_subsystem netconsole_subsys;
+static DEFINE_MUTEX(dynamic_netconsole_mutex);
static int __init dynamic_netconsole_init(void)
{
@@ -185,9 +192,13 @@ static struct netconsole_target *alloc_param_target(char *target_config)
strlcpy(nt->np.dev_name, "eth0", IFNAMSIZ);
nt->np.local_port = 6665;
nt->np.remote_port = 6666;
- mutex_init(&nt->mutex);
eth_broadcast_addr(nt->np.remote_mac);
+ if (*target_config == '+') {
+ nt->extended = true;
+ target_config++;
+ }
+
/* Parse parameters and setup netpoll */
err = netpoll_parse_options(&nt->np, target_config);
if (err)
@@ -197,7 +208,7 @@ static struct netconsole_target *alloc_param_target(char *target_config)
if (err)
goto fail;
- nt->enabled = 1;
+ nt->enabled = true;
return nt;
@@ -258,6 +269,11 @@ static ssize_t show_enabled(struct netconsole_target *nt, char *buf)
return snprintf(buf, PAGE_SIZE, "%d\n", nt->enabled);
}
+static ssize_t show_extended(struct netconsole_target *nt, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", nt->extended);
+}
+
static ssize_t show_dev_name(struct netconsole_target *nt, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%s\n", nt->np.dev_name);
@@ -322,13 +338,18 @@ static ssize_t store_enabled(struct netconsole_target *nt,
return err;
if (enabled < 0 || enabled > 1)
return -EINVAL;
- if (enabled == nt->enabled) {
+ if ((bool)enabled == nt->enabled) {
pr_info("network logging has already %s\n",
nt->enabled ? "started" : "stopped");
return -EINVAL;
}
- if (enabled) { /* 1 */
+ if (enabled) { /* true */
+ if (nt->extended && !(netconsole_ext.flags & CON_ENABLED)) {
+ netconsole_ext.flags |= CON_ENABLED;
+ register_console(&netconsole_ext);
+ }
+
/*
* Skip netpoll_parse_options() -- all the attributes are
* already configured via configfs. Just print them out.
@@ -340,13 +361,13 @@ static ssize_t store_enabled(struct netconsole_target *nt,
return err;
pr_info("netconsole: network logging started\n");
- } else { /* 0 */
+ } else { /* false */
/* We need to disable the netconsole before cleaning it up
* otherwise we might end up in write_msg() with
- * nt->np.dev == NULL and nt->enabled == 1
+ * nt->np.dev == NULL and nt->enabled == true
*/
spin_lock_irqsave(&target_list_lock, flags);
- nt->enabled = 0;
+ nt->enabled = false;
spin_unlock_irqrestore(&target_list_lock, flags);
netpoll_cleanup(&nt->np);
}
@@ -356,6 +377,30 @@ static ssize_t store_enabled(struct netconsole_target *nt,
return strnlen(buf, count);
}
+static ssize_t store_extended(struct netconsole_target *nt,
+ const char *buf,
+ size_t count)
+{
+ int extended;
+ int err;
+
+ if (nt->enabled) {
+ pr_err("target (%s) is enabled, disable to update parameters\n",
+ config_item_name(&nt->item));
+ return -EINVAL;
+ }
+
+ err = kstrtoint(buf, 10, &extended);
+ if (err < 0)
+ return err;
+ if (extended < 0 || extended > 1)
+ return -EINVAL;
+
+ nt->extended = extended;
+
+ return strnlen(buf, count);
+}
+
static ssize_t store_dev_name(struct netconsole_target *nt,
const char *buf,
size_t count)
@@ -508,6 +553,7 @@ static struct netconsole_target_attr netconsole_target_##_name = \
__CONFIGFS_ATTR(_name, S_IRUGO | S_IWUSR, show_##_name, store_##_name)
NETCONSOLE_TARGET_ATTR_RW(enabled);
+NETCONSOLE_TARGET_ATTR_RW(extended);
NETCONSOLE_TARGET_ATTR_RW(dev_name);
NETCONSOLE_TARGET_ATTR_RW(local_port);
NETCONSOLE_TARGET_ATTR_RW(remote_port);
@@ -518,6 +564,7 @@ NETCONSOLE_TARGET_ATTR_RW(remote_mac);
static struct configfs_attribute *netconsole_target_attrs[] = {
&netconsole_target_enabled.attr,
+ &netconsole_target_extended.attr,
&netconsole_target_dev_name.attr,
&netconsole_target_local_port.attr,
&netconsole_target_remote_port.attr,
@@ -562,10 +609,10 @@ static ssize_t netconsole_target_attr_store(struct config_item *item,
struct netconsole_target_attr *na =
container_of(attr, struct netconsole_target_attr, attr);
- mutex_lock(&nt->mutex);
+ mutex_lock(&dynamic_netconsole_mutex);
if (na->store)
ret = na->store(nt, buf, count);
- mutex_unlock(&nt->mutex);
+ mutex_unlock(&dynamic_netconsole_mutex);
return ret;
}
@@ -594,7 +641,7 @@ static struct config_item *make_netconsole_target(struct config_group *group,
/*
* Allocate and initialize with defaults.
- * Target is disabled at creation (enabled == 0).
+ * Target is disabled at creation (!enabled).
*/
nt = kzalloc(sizeof(*nt), GFP_KERNEL);
if (!nt)
@@ -604,7 +651,6 @@ static struct config_item *make_netconsole_target(struct config_group *group,
strlcpy(nt->np.dev_name, "eth0", IFNAMSIZ);
nt->np.local_port = 6665;
nt->np.remote_port = 6666;
- mutex_init(&nt->mutex);
eth_broadcast_addr(nt->np.remote_mac);
/* Initialize the config_item member */
@@ -695,7 +741,7 @@ restart:
spin_lock_irqsave(&target_list_lock, flags);
dev_put(nt->np.dev);
nt->np.dev = NULL;
- nt->enabled = 0;
+ nt->enabled = false;
stopped = true;
netconsole_target_put(nt);
goto restart;
@@ -729,6 +775,82 @@ static struct notifier_block netconsole_netdev_notifier = {
.notifier_call = netconsole_netdev_event,
};
+/**
+ * send_ext_msg_udp - send extended log message to target
+ * @nt: target to send message to
+ * @msg: extended log message to send
+ * @msg_len: length of message
+ *
+ * Transfer extended log @msg to @nt. If @msg is longer than
+ * MAX_PRINT_CHUNK, it'll be split and transmitted in multiple chunks with
+ * ncfrag header field added to identify them.
+ */
+static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg,
+ int msg_len)
+{
+ static char buf[MAX_PRINT_CHUNK]; /* protected by target_list_lock */
+ const char *header, *body;
+ int offset = 0;
+ int header_len, body_len;
+
+ if (msg_len <= MAX_PRINT_CHUNK) {
+ netpoll_send_udp(&nt->np, msg, msg_len);
+ return;
+ }
+
+ /* need to insert extra header fields, detect header and body */
+ header = msg;
+ body = memchr(msg, ';', msg_len);
+ if (WARN_ON_ONCE(!body))
+ return;
+
+ header_len = body - header;
+ body_len = msg_len - header_len - 1;
+ body++;
+
+ /*
+ * Transfer multiple chunks with the following extra header.
+ * "ncfrag=<byte-offset>/<total-bytes>"
+ */
+ memcpy(buf, header, header_len);
+
+ while (offset < body_len) {
+ int this_header = header_len;
+ int this_chunk;
+
+ this_header += scnprintf(buf + this_header,
+ sizeof(buf) - this_header,
+ ",ncfrag=%d/%d;", offset, body_len);
+
+ this_chunk = min(body_len - offset,
+ MAX_PRINT_CHUNK - this_header);
+ if (WARN_ON_ONCE(this_chunk <= 0))
+ return;
+
+ memcpy(buf + this_header, body + offset, this_chunk);
+
+ netpoll_send_udp(&nt->np, buf, this_header + this_chunk);
+
+ offset += this_chunk;
+ }
+}
+
+static void write_ext_msg(struct console *con, const char *msg,
+ unsigned int len)
+{
+ struct netconsole_target *nt;
+ unsigned long flags;
+
+ if ((oops_only && !oops_in_progress) || list_empty(&target_list))
+ return;
+
+ spin_lock_irqsave(&target_list_lock, flags);
+ list_for_each_entry(nt, &target_list, list)
+ if (nt->extended && nt->enabled && netif_running(nt->np.dev))
+ send_ext_msg_udp(nt, msg, len);
+ spin_unlock_irqrestore(&target_list_lock, flags);
+}
+
static void write_msg(struct console *con, const char *msg, unsigned int len)
{
int frag, left;
@@ -744,8 +866,7 @@ static void write_msg(struct console *con, const char *msg, unsigned int len)
spin_lock_irqsave(&target_list_lock, flags);
list_for_each_entry(nt, &target_list, list) {
- netconsole_target_get(nt);
- if (nt->enabled && netif_running(nt->np.dev)) {
+ if (!nt->extended && nt->enabled && netif_running(nt->np.dev)) {
/*
* We nest this inside the for-each-target loop above
* so that we're able to get as much logging out to
@@ -760,11 +881,16 @@ static void write_msg(struct console *con, const char *msg, unsigned int len)
left -= frag;
}
}
- netconsole_target_put(nt);
}
spin_unlock_irqrestore(&target_list_lock, flags);
}
+static struct console netconsole_ext = {
+ .name = "netcon_ext",
+ .flags = CON_EXTENDED, /* starts disabled, registered on first use */
+ .write = write_ext_msg,
+};
+
static struct console netconsole = {
.name = "netcon",
.flags = CON_ENABLED,
@@ -787,7 +913,11 @@ static int __init init_netconsole(void)
goto fail;
}
/* Dump existing printks when we register */
- netconsole.flags |= CON_PRINTBUFFER;
+ if (nt->extended)
+ netconsole_ext.flags |= CON_PRINTBUFFER |
+ CON_ENABLED;
+ else
+ netconsole.flags |= CON_PRINTBUFFER;
spin_lock_irqsave(&target_list_lock, flags);
list_add(&nt->list, &target_list);
@@ -803,6 +933,8 @@ static int __init init_netconsole(void)
if (err)
goto undonotifier;
+ if (netconsole_ext.flags & CON_ENABLED)
+ register_console(&netconsole_ext);
register_console(&netconsole);
pr_info("network logging started\n");
@@ -831,6 +963,7 @@ static void __exit cleanup_netconsole(void)
{
struct netconsole_target *nt, *tmp;
+ unregister_console(&netconsole_ext);
unregister_console(&netconsole);
dynamic_netconsole_exit();
unregister_netdevice_notifier(&netconsole_netdev_notifier);
diff --git a/drivers/pci/host/pcie-spear13xx.c b/drivers/pci/host/pcie-spear13xx.c
index 020d788907191f..fb0b3e9b7e0a32 100644
--- a/drivers/pci/host/pcie-spear13xx.c
+++ b/drivers/pci/host/pcie-spear13xx.c
@@ -4,8 +4,8 @@
* SPEAr13xx PCIe Glue Layer Source Code
*
* Copyright (C) 2010-2014 ST Microelectronics
- * Pratyush Anand <pratyush.anand@st.com>
- * Mohit Kumar <mohit.kumar@st.com>
+ * Pratyush Anand <pratyush.anand@gmail.com>
+ * Mohit Kumar <mohit.kumar.dhaka@gmail.com>
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
@@ -387,5 +387,5 @@ static int __init spear13xx_pcie_init(void)
module_init(spear13xx_pcie_init);
MODULE_DESCRIPTION("ST Microelectronics SPEAr13xx PCIe host controller driver");
-MODULE_AUTHOR("Pratyush Anand <pratyush.anand@st.com>");
+MODULE_AUTHOR("Pratyush Anand <pratyush.anand@gmail.com>");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/phy/phy-spear1310-miphy.c b/drivers/phy/phy-spear1310-miphy.c
index 65ae640cfbd1cf..45d0005b2203f1 100644
--- a/drivers/phy/phy-spear1310-miphy.c
+++ b/drivers/phy/phy-spear1310-miphy.c
@@ -2,8 +2,8 @@
* ST SPEAr1310-miphy driver
*
* Copyright (C) 2014 ST Microelectronics
- * Pratyush Anand <pratyush.anand@st.com>
- * Mohit Kumar <mohit.kumar@st.com>
+ * Pratyush Anand <pratyush.anand@gmail.com>
+ * Mohit Kumar <mohit.kumar.dhaka@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -257,5 +257,5 @@ static struct platform_driver spear1310_miphy_driver = {
module_platform_driver(spear1310_miphy_driver);
MODULE_DESCRIPTION("ST SPEAR1310-MIPHY driver");
-MODULE_AUTHOR("Pratyush Anand <pratyush.anand@st.com>");
+MODULE_AUTHOR("Pratyush Anand <pratyush.anand@gmail.com>");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/phy/phy-spear1340-miphy.c b/drivers/phy/phy-spear1340-miphy.c
index 1a00c2817f34f8..494240da4a398e 100644
--- a/drivers/phy/phy-spear1340-miphy.c
+++ b/drivers/phy/phy-spear1340-miphy.c
@@ -2,8 +2,8 @@
* ST spear1340-miphy driver
*
* Copyright (C) 2014 ST Microelectronics
- * Pratyush Anand <pratyush.anand@st.com>
- * Mohit Kumar <mohit.kumar@st.com>
+ * Pratyush Anand <pratyush.anand@gmail.com>
+ * Mohit Kumar <mohit.kumar.dhaka@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -290,5 +290,5 @@ static struct platform_driver spear1340_miphy_driver = {
module_platform_driver(spear1340_miphy_driver);
MODULE_DESCRIPTION("ST SPEAR1340-MIPHY driver");
-MODULE_AUTHOR("Pratyush Anand <pratyush.anand@st.com>");
+MODULE_AUTHOR("Pratyush Anand <pratyush.anand@gmail.com>");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index ea2a315df6b7bb..272fc934ce6b9b 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -61,6 +61,8 @@ static int rtc_suspend(struct device *dev)
if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0)
return 0;
+ rtc->valid_alarm = !rtc_read_alarm(rtc, &rtc->alarm);
+
/* snapshot the current RTC and system time at suspend*/
err = rtc_read_time(rtc, &tm);
if (err < 0) {
@@ -105,6 +107,27 @@ static int rtc_resume(struct device *dev)
if (timekeeping_rtc_skipresume())
return 0;
+ /*
+ * Ensure that the platform hasn't overwritten a pending alarm while
+ * suspended
+ */
+ if (rtc->valid_alarm) {
+ long now, scheduled;
+
+ rtc_read_time(rtc, &tm);
+ rtc_tm_to_time(&rtc->alarm.time, &scheduled);
+ rtc_tm_to_time(&tm, &now);
+
+ /* Clear the alarm registers if it went off during suspend */
+ if (scheduled <= now) {
+ rtc_time_to_tm(0, &rtc->alarm.time);
+ rtc->alarm.enabled = 0;
+ }
+
+ if (rtc->ops && rtc->ops->set_alarm)
+ rtc->ops->set_alarm(rtc->dev.parent, &rtc->alarm);
+ }
+
rtc_hctosys_ret = -ENODEV;
if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0)
return 0;
@@ -141,6 +164,7 @@ static int rtc_resume(struct device *dev)
if (sleep_time.tv_sec >= 0)
timekeeping_inject_sleeptime64(&sleep_time);
rtc_hctosys_ret = 0;
+
return 0;
}
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index a6b14c24d7e3df..31f11ab214a020 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -494,7 +494,10 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
struct rtc_time tm;
ktime_t now, onesec;
- __rtc_read_time(rtc, &tm);
+ err = __rtc_read_time(rtc, &tm);
+ if (err < 0)
+ goto out;
+
onesec = ktime_set(1, 0);
now = rtc_tm_to_ktime(tm);
rtc->uie_rtctimer.node.expires = ktime_add(now, onesec);
@@ -872,13 +875,17 @@ void rtc_timer_do_work(struct work_struct *work)
struct timerqueue_node *next;
ktime_t now;
struct rtc_time tm;
+ int err = 0;
struct rtc_device *rtc =
container_of(work, struct rtc_device, irqwork);
mutex_lock(&rtc->ops_lock);
again:
- __rtc_read_time(rtc, &tm);
+ err = __rtc_read_time(rtc, &tm);
+ if (err < 0)
+ goto out;
+
now = rtc_tm_to_ktime(tm);
while ((next = timerqueue_getnext(&rtc->timerqueue))) {
if (next->expires.tv64 > now.tv64)
@@ -903,7 +910,6 @@ again:
/* Set next alarm */
if (next) {
struct rtc_wkalrm alarm;
- int err;
int retry = 3;
alarm.time = rtc_ktime_to_tm(next->expires);
@@ -925,6 +931,7 @@ reprogram:
} else
rtc_alarm_disable(rtc);
+out:
pm_relax(rtc->dev.parent);
mutex_unlock(&rtc->ops_lock);
}
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 4ffabb322a9a55..6e76de1856fc14 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -742,17 +742,17 @@ static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t)
regs[6] &= ~MCP794XX_BIT_ALMX_IF;
/* Set alarm match: second, minute, hour, day, date, month. */
regs[6] |= MCP794XX_MSK_ALMX_MATCH;
-
- if (t->enabled)
- regs[0] |= MCP794XX_BIT_ALM0_EN;
- else
- regs[0] &= ~MCP794XX_BIT_ALM0_EN;
+ /* Disable interrupt. We will not enable until completely programmed */
+ regs[0] &= ~MCP794XX_BIT_ALM0_EN;
ret = ds1307->write_block_data(client, MCP794XX_REG_CONTROL, 10, regs);
if (ret < 0)
return ret;
- return 0;
+ if (!t->enabled)
+ return 0;
+ regs[0] |= MCP794XX_BIT_ALM0_EN;
+ return i2c_smbus_write_byte_data(client, MCP794XX_REG_CONTROL, regs[0]);
}
static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled)
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index 8b6355ffaff990..7fc666a0bb2e37 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -107,6 +107,8 @@
/* OMAP_RTC_OSC_REG bit fields: */
#define OMAP_RTC_OSC_32KCLK_EN BIT(6)
+#define OMAP_RTC_OSC_OSC32K_GZ BIT(4)
+#define OMAP_RTC_OSC_EXT_32K BIT(3)
/* OMAP_RTC_IRQWAKEEN bit fields: */
#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN BIT(1)
@@ -122,6 +124,7 @@ struct omap_rtc;
struct omap_rtc_device_type {
bool has_32kclk_en;
+ bool has_osc_ext_32k;
bool has_irqwakeen;
bool has_pmic_mode;
bool has_power_up_reset;
@@ -481,6 +484,7 @@ static const struct omap_rtc_device_type omap_rtc_default_type = {
static const struct omap_rtc_device_type omap_rtc_am3352_type = {
.has_32kclk_en = true,
+ .has_osc_ext_32k = true,
.has_irqwakeen = true,
.has_pmic_mode = true,
.lock = am3352_rtc_lock,
@@ -577,7 +581,15 @@ static int omap_rtc_probe(struct platform_device *pdev)
if (rtc->type->has_32kclk_en) {
reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
rtc_writel(rtc, OMAP_RTC_OSC_REG,
- reg | OMAP_RTC_OSC_32KCLK_EN);
+ reg | OMAP_RTC_OSC_32KCLK_EN);
+ }
+
+ /* Enable external clock as the source */
+ if (rtc->type->has_osc_ext_32k) {
+ rtc_writel(rtc, OMAP_RTC_OSC_REG,
+ (OMAP_RTC_OSC_EXT_32K |
+ rtc_read(rtc, OMAP_RTC_OSC_REG)) &
+ (~OMAP_RTC_OSC_OSC32K_GZ));
}
/* clear old status */
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 1f8e2dc9c616a0..30268bb2ddb6a3 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -2363,17 +2363,13 @@ do_device_access(struct scsi_cmnd *scmd, u64 lba, u32 num, bool do_write)
u64 block, rest = 0;
struct scsi_data_buffer *sdb;
enum dma_data_direction dir;
- size_t (*func)(struct scatterlist *, unsigned int, void *, size_t,
- off_t);
if (do_write) {
sdb = scsi_out(scmd);
dir = DMA_TO_DEVICE;
- func = sg_pcopy_to_buffer;
} else {
sdb = scsi_in(scmd);
dir = DMA_FROM_DEVICE;
- func = sg_pcopy_from_buffer;
}
if (!sdb->length)
@@ -2385,16 +2381,16 @@ do_device_access(struct scsi_cmnd *scmd, u64 lba, u32 num, bool do_write)
if (block + num > sdebug_store_sectors)
rest = block + num - sdebug_store_sectors;
- ret = func(sdb->table.sgl, sdb->table.nents,
+ ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents,
fake_storep + (block * scsi_debug_sector_size),
- (num - rest) * scsi_debug_sector_size, 0);
+ (num - rest) * scsi_debug_sector_size, 0, do_write);
if (ret != (num - rest) * scsi_debug_sector_size)
return ret;
if (rest) {
- ret += func(sdb->table.sgl, sdb->table.nents,
+ ret += sg_copy_buffer(sdb->table.sgl, sdb->table.nents,
fake_storep, rest * scsi_debug_sector_size,
- (num - rest) * scsi_debug_sector_size);
+ (num - rest) * scsi_debug_sector_size, do_write);
}
return ret;
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index defddf5f80dddf..872bd603fd0d8b 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -167,7 +167,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
* infrastructure. There is no real reason why the selected
* task should have access to the memory reserves.
*/
- mark_tsk_oom_victim(selected);
+ mark_oom_victim(selected);
task_unlock(selected);
lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n",
selected->pid, selected->comm,
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 2d285fab98688c..f2fc8cc3dbee31 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,10 @@ static struct sysrq_key_op sysrq_term_op = {
static void moom_callback(struct work_struct *ignored)
{
- if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
- GFP_KERNEL, 0, NULL, true))
+ mutex_lock(&oom_lock);
+ if (!force_out_of_memory())
pr_info("OOM request ignored because killer is disabled\n");
+ mutex_unlock(&oom_lock);
}
static DECLARE_WORK(moom_work, moom_callback);
diff --git a/drivers/usb/misc/lvstest.c b/drivers/usb/misc/lvstest.c
index 62cb8cd08403d8..86b4e4b2ab9a60 100644
--- a/drivers/usb/misc/lvstest.c
+++ b/drivers/usb/misc/lvstest.c
@@ -4,7 +4,7 @@
* Test pattern generation for Link Layer Validation System Tests
*
* Copyright (C) 2014 ST Microelectronics
- * Pratyush Anand <pratyush.anand@st.com>
+ * Pratyush Anand <pratyush.anand@gmail.com>
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
diff --git a/drivers/w1/masters/omap_hdq.c b/drivers/w1/masters/omap_hdq.c
index e7d448963a2435..0e2f43bccf1ffb 100644
--- a/drivers/w1/masters/omap_hdq.c
+++ b/drivers/w1/masters/omap_hdq.c
@@ -17,6 +17,7 @@
#include <linux/io.h>
#include <linux/sched.h>
#include <linux/pm_runtime.h>
+#include <linux/of.h>
#include "../w1.h"
#include "../w1_int.h"
@@ -27,21 +28,23 @@
#define OMAP_HDQ_TX_DATA 0x04
#define OMAP_HDQ_RX_DATA 0x08
#define OMAP_HDQ_CTRL_STATUS 0x0c
-#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK (1<<6)
-#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE (1<<5)
-#define OMAP_HDQ_CTRL_STATUS_GO (1<<4)
-#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION (1<<2)
-#define OMAP_HDQ_CTRL_STATUS_DIR (1<<1)
-#define OMAP_HDQ_CTRL_STATUS_MODE (1<<0)
+#define OMAP_HDQ_CTRL_STATUS_SINGLE BIT(7)
+#define OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK BIT(6)
+#define OMAP_HDQ_CTRL_STATUS_CLOCKENABLE BIT(5)
+#define OMAP_HDQ_CTRL_STATUS_GO BIT(4)
+#define OMAP_HDQ_CTRL_STATUS_PRESENCE BIT(3)
+#define OMAP_HDQ_CTRL_STATUS_INITIALIZATION BIT(2)
+#define OMAP_HDQ_CTRL_STATUS_DIR BIT(1)
#define OMAP_HDQ_INT_STATUS 0x10
-#define OMAP_HDQ_INT_STATUS_TXCOMPLETE (1<<2)
-#define OMAP_HDQ_INT_STATUS_RXCOMPLETE (1<<1)
-#define OMAP_HDQ_INT_STATUS_TIMEOUT (1<<0)
+#define OMAP_HDQ_INT_STATUS_TXCOMPLETE BIT(2)
+#define OMAP_HDQ_INT_STATUS_RXCOMPLETE BIT(1)
+#define OMAP_HDQ_INT_STATUS_TIMEOUT BIT(0)
#define OMAP_HDQ_SYSCONFIG 0x14
-#define OMAP_HDQ_SYSCONFIG_SOFTRESET (1<<1)
-#define OMAP_HDQ_SYSCONFIG_AUTOIDLE (1<<0)
+#define OMAP_HDQ_SYSCONFIG_SOFTRESET BIT(1)
+#define OMAP_HDQ_SYSCONFIG_AUTOIDLE BIT(0)
+#define OMAP_HDQ_SYSCONFIG_NOIDLE 0x0
#define OMAP_HDQ_SYSSTATUS 0x18
-#define OMAP_HDQ_SYSSTATUS_RESETDONE (1<<0)
+#define OMAP_HDQ_SYSSTATUS_RESETDONE BIT(0)
#define OMAP_HDQ_FLAG_CLEAR 0
#define OMAP_HDQ_FLAG_SET 1
@@ -67,6 +70,10 @@ struct hdq_data {
* the data wrire or read.
*/
int init_trans;
+ int rrw;
+ /* mode: 0-HDQ 1-W1 */
+ int mode;
+
};
static int omap_hdq_probe(struct platform_device *pdev);
@@ -74,6 +81,7 @@ static int omap_hdq_remove(struct platform_device *pdev);
static const struct of_device_id omap_hdq_dt_ids[] = {
{ .compatible = "ti,omap3-1w" },
+ { .compatible = "ti,am4372-hdq" },
{}
};
MODULE_DEVICE_TABLE(of, omap_hdq_dt_ids);
@@ -90,15 +98,12 @@ static struct platform_driver omap_hdq_driver = {
static u8 omap_w1_read_byte(void *_hdq);
static void omap_w1_write_byte(void *_hdq, u8 byte);
static u8 omap_w1_reset_bus(void *_hdq);
-static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
- u8 search_type, w1_slave_found_callback slave_found);
static struct w1_bus_master omap_w1_master = {
.read_byte = omap_w1_read_byte,
.write_byte = omap_w1_write_byte,
.reset_bus = omap_w1_reset_bus,
- .search = omap_w1_search_bus,
};
/* HDQ register I/O routines */
@@ -122,6 +127,15 @@ static inline u8 hdq_reg_merge(struct hdq_data *hdq_data, u32 offset,
return new_val;
}
+static void hdq_disable_interrupt(struct hdq_data *hdq_data, u32 offset,
+ u32 mask)
+{
+ u32 ie;
+
+ ie = readl(hdq_data->hdq_base + offset);
+ writel(ie & mask, hdq_data->hdq_base + offset);
+}
+
/*
* Wait for one or more bits in flag change.
* HDQ_FLAG_SET: wait until any bit in the flag is set.
@@ -229,13 +243,7 @@ static irqreturn_t hdq_isr(int irq, void *_hdq)
return IRQ_HANDLED;
}
-/* HDQ Mode: always return success */
-static u8 omap_w1_reset_bus(void *_hdq)
-{
- return 0;
-}
-
-/* W1 search callback function */
+/* W1 search callback function in HDQ mode */
static void omap_w1_search_bus(void *_hdq, struct w1_master *master_dev,
u8 search_type, w1_slave_found_callback slave_found)
{
@@ -262,9 +270,10 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
int ret;
u8 tmp_status;
- hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG, OMAP_HDQ_SYSCONFIG_SOFTRESET);
+ hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+ OMAP_HDQ_SYSCONFIG_SOFTRESET);
/*
- * Select HDQ mode & enable clocks.
+ * Select HDQ/1W mode & enable clocks.
* It is observed that INT flags can't be cleared via a read and GO/INIT
* won't return to zero if interrupt is disabled. So we always enable
* interrupt.
@@ -282,7 +291,8 @@ static int _omap_hdq_reset(struct hdq_data *hdq_data)
else {
hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
- OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+ hdq_data->mode);
hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
OMAP_HDQ_SYSCONFIG_AUTOIDLE);
}
@@ -334,6 +344,18 @@ static int omap_hdq_break(struct hdq_data *hdq_data)
ret = -ETIMEDOUT;
goto out;
}
+
+ /*
+ * check for the presence detect bit to get
+ * set to show that the slave is responding
+ */
+ if (!(hdq_reg_in(hdq_data, OMAP_HDQ_CTRL_STATUS) &
+ OMAP_HDQ_CTRL_STATUS_PRESENCE)) {
+ dev_dbg(hdq_data->dev, "Presence bit not set\n");
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
/*
* wait for both INIT and GO bits rerurn to zero.
* zero wait time expected for interrupt mode.
@@ -368,6 +390,8 @@ static int hdq_read_byte(struct hdq_data *hdq_data, u8 *val)
goto out;
}
+ hdq_data->hdq_irqstatus = 0;
+
if (!(hdq_data->hdq_irqstatus & OMAP_HDQ_INT_STATUS_RXCOMPLETE)) {
hdq_reg_merge(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_DIR | OMAP_HDQ_CTRL_STATUS_GO,
@@ -400,7 +424,7 @@ rtn:
}
-/* Enable clocks and set the controller to HDQ mode */
+/* Enable clocks and set the controller to HDQ/1W mode */
static int omap_hdq_get(struct hdq_data *hdq_data)
{
int ret = 0;
@@ -422,7 +446,7 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
pm_runtime_get_sync(hdq_data->dev);
- /* make sure HDQ is out of reset */
+ /* make sure HDQ/1W is out of reset */
if (!(hdq_reg_in(hdq_data, OMAP_HDQ_SYSSTATUS) &
OMAP_HDQ_SYSSTATUS_RESETDONE)) {
ret = _omap_hdq_reset(hdq_data);
@@ -430,12 +454,13 @@ static int omap_hdq_get(struct hdq_data *hdq_data)
/* back up the count */
hdq_data->hdq_usecount--;
} else {
- /* select HDQ mode & enable clocks */
+ /* select HDQ/1W mode & enable clocks */
hdq_reg_out(hdq_data, OMAP_HDQ_CTRL_STATUS,
OMAP_HDQ_CTRL_STATUS_CLOCKENABLE |
- OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK |
+ hdq_data->mode);
hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
- OMAP_HDQ_SYSCONFIG_AUTOIDLE);
+ OMAP_HDQ_SYSCONFIG_NOIDLE);
hdq_reg_in(hdq_data, OMAP_HDQ_INT_STATUS);
}
}
@@ -456,6 +481,8 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
if (ret < 0)
return -EINTR;
+ hdq_reg_out(hdq_data, OMAP_HDQ_SYSCONFIG,
+ OMAP_HDQ_SYSCONFIG_AUTOIDLE);
if (0 == hdq_data->hdq_usecount) {
dev_dbg(hdq_data->dev, "attempt to decrement use count"
" when it is zero");
@@ -471,6 +498,100 @@ static int omap_hdq_put(struct hdq_data *hdq_data)
return ret;
}
+/*
+ * W1 triplet callback function - used for searching ROM addresses.
+ * Registered only when controller is in 1-wire mode.
+ */
+static u8 omap_w1_triplet(void *_hdq, u8 bdir)
+{
+ u8 id_bit, comp_bit;
+ int err;
+ u8 ret = 0x3; /* no slaves responded */
+ struct hdq_data *hdq_data = _hdq;
+ u8 ctrl = OMAP_HDQ_CTRL_STATUS_SINGLE | OMAP_HDQ_CTRL_STATUS_GO |
+ OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK;
+ u8 mask = ctrl | OMAP_HDQ_CTRL_STATUS_DIR;
+
+ omap_hdq_get(_hdq);
+
+ err = mutex_lock_interruptible(&hdq_data->hdq_mutex);
+ if (err < 0) {
+ dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
+ goto rtn;
+ }
+
+ hdq_data->hdq_irqstatus = 0;
+ /* read id_bit */
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+ ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+ goto out;
+ }
+ id_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+ hdq_data->hdq_irqstatus = 0;
+ /* read comp_bit */
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS,
+ ctrl | OMAP_HDQ_CTRL_STATUS_DIR, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_RXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "RX wait elapsed\n");
+ goto out;
+ }
+ comp_bit = (hdq_reg_in(_hdq, OMAP_HDQ_RX_DATA) & 0x01);
+
+ if (id_bit && comp_bit) {
+ ret = 0x03; /* no slaves responded */
+ goto out;
+ }
+ if (!id_bit && !comp_bit) {
+ /* Both bits are valid, take the direction given */
+ ret = bdir ? 0x04 : 0;
+ } else {
+ /* Only one bit is valid, take that direction */
+ bdir = id_bit;
+ ret = id_bit ? 0x05 : 0x02;
+ }
+
+ /* write bdir bit */
+ hdq_reg_out(_hdq, OMAP_HDQ_TX_DATA, bdir);
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, ctrl, mask);
+ err = wait_event_timeout(hdq_wait_queue,
+ (hdq_data->hdq_irqstatus
+ & OMAP_HDQ_INT_STATUS_TXCOMPLETE),
+ OMAP_HDQ_TIMEOUT);
+ if (err == 0) {
+ dev_dbg(hdq_data->dev, "TX wait elapsed\n");
+ goto out;
+ }
+
+ hdq_reg_merge(_hdq, OMAP_HDQ_CTRL_STATUS, 0,
+ OMAP_HDQ_CTRL_STATUS_SINGLE);
+
+out:
+ mutex_unlock(&hdq_data->hdq_mutex);
+rtn:
+ omap_hdq_put(_hdq);
+ return ret;
+}
+
+/* reset callback */
+static u8 omap_w1_reset_bus(void *_hdq)
+{
+ omap_hdq_get(_hdq);
+ omap_hdq_break(_hdq);
+ omap_hdq_put(_hdq);
+ return 0;
+}
+
/* Read a byte of data from the device */
static u8 omap_w1_read_byte(void *_hdq)
{
@@ -478,6 +599,10 @@ static u8 omap_w1_read_byte(void *_hdq)
u8 val = 0;
int ret;
+ /* First write to initialize the transfer */
+ if (hdq_data->init_trans == 0)
+ omap_hdq_get(hdq_data);
+
ret = hdq_read_byte(hdq_data, &val);
if (ret) {
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -491,6 +616,10 @@ static u8 omap_w1_read_byte(void *_hdq)
return -1;
}
+ hdq_disable_interrupt(hdq_data, OMAP_HDQ_CTRL_STATUS,
+ ~OMAP_HDQ_CTRL_STATUS_INTERRUPTMASK);
+ hdq_data->hdq_usecount = 0;
+
/* Write followed by a read, release the module */
if (hdq_data->init_trans) {
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
@@ -517,6 +646,14 @@ static void omap_w1_write_byte(void *_hdq, u8 byte)
if (hdq_data->init_trans == 0)
omap_hdq_get(hdq_data);
+ /*
+ * We need to reset the slave before
+ * issuing the SKIP ROM command, else
+ * the slave will not work.
+ */
+ if (byte == W1_SKIP_ROM)
+ omap_hdq_break(hdq_data);
+
ret = mutex_lock_interruptible(&hdq_data->hdq_mutex);
if (ret < 0) {
dev_dbg(hdq_data->dev, "Could not acquire mutex\n");
@@ -551,6 +688,7 @@ static int omap_hdq_probe(struct platform_device *pdev)
struct resource *res;
int ret, irq;
u8 rev;
+ const char *mode;
hdq_data = devm_kzalloc(dev, sizeof(*hdq_data), GFP_KERNEL);
if (!hdq_data) {
@@ -567,10 +705,21 @@ static int omap_hdq_probe(struct platform_device *pdev)
return PTR_ERR(hdq_data->hdq_base);
hdq_data->hdq_usecount = 0;
+ hdq_data->rrw = 0;
mutex_init(&hdq_data->hdq_mutex);
pm_runtime_enable(&pdev->dev);
- pm_runtime_get_sync(&pdev->dev);
+ ret = pm_runtime_get_sync(&pdev->dev);
+ if (ret < 0) {
+ dev_dbg(&pdev->dev, "pm_runtime_get_sync failed\n");
+ goto err_w1;
+ }
+
+ ret = _omap_hdq_reset(hdq_data);
+ if (ret) {
+ dev_dbg(&pdev->dev, "reset failed\n");
+ return -EINVAL;
+ }
rev = hdq_reg_in(hdq_data, OMAP_HDQ_REVISION);
dev_info(&pdev->dev, "OMAP HDQ Hardware Rev %c.%c. Driver in %s mode\n",
@@ -594,6 +743,15 @@ static int omap_hdq_probe(struct platform_device *pdev)
pm_runtime_put_sync(&pdev->dev);
+ ret = of_property_read_string(pdev->dev.of_node, "ti,mode", &mode);
+ if (ret < 0 || !strcmp(mode, "hdq")) {
+ hdq_data->mode = 0;
+ omap_w1_master.search = omap_w1_search_bus;
+ } else {
+ hdq_data->mode = 1;
+ omap_w1_master.triplet = omap_w1_triplet;
+ }
+
omap_w1_master.data = hdq_data;
ret = w1_add_master_device(&omap_w1_master);
@@ -635,8 +793,8 @@ static int omap_hdq_remove(struct platform_device *pdev)
module_platform_driver(omap_hdq_driver);
module_param(w1_id, int, S_IRUSR);
-MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection");
+MODULE_PARM_DESC(w1_id, "1-wire id for the slave detection in HDQ mode");
MODULE_AUTHOR("Texas Instruments");
-MODULE_DESCRIPTION("HDQ driver Library");
+MODULE_DESCRIPTION("HDQ-1W driver Library");
MODULE_LICENSE("GPL");
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index c4211a31612d44..d88f36754bf7ef 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void)
#ifdef CONFIG_FRONTSWAP
if (tmem_enabled && frontswap) {
char *s = "";
- struct frontswap_ops *old_ops;
tmem_frontswap_poolid = -1;
- old_ops = frontswap_register_ops(&tmem_frontswap_ops);
- if (IS_ERR(old_ops) || old_ops) {
- if (IS_ERR(old_ops))
- return PTR_ERR(old_ops);
- s = " (WARNING: frontswap_ops overridden)";
- }
+ frontswap_register_ops(&tmem_frontswap_ops);
pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
s);
}
diff --git a/fs/Makefile b/fs/Makefile
index cb20e4bf230398..0f3299fa7bca40 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index a19c31d3f36946..4d4a0df8344fe2 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index a8f463c028ce9e..5fa92bc790ef7e 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry)
{
struct inode *dir, *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
- struct buffer_head *bh = NULL, *link_bh = NULL;
+ struct buffer_head *bh, *link_bh = NULL;
u32 link_ino, ino;
int retval;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a022f4accd76b8..17349500592d55 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
{
struct super_block *sb = dir->i_sb;
struct buffer_head *inode_bh = NULL;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
u32 block = 0;
int retval;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index f39b71c3981e7d..ea5b69a18ba9ce 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
char *link = kmap(page);
struct slink_front *lf;
- int err;
int i, j;
char c;
char lc;
pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
- err = -EIO;
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
goto fail;
@@ -66,7 +64,7 @@ fail:
SetPageError(page);
kunmap(page);
unlock_page(page);
- return err;
+ return -EIO;
}
const struct address_space_operations affs_symlink_aops = {
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 0826e91dacda31..22c16628088306 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,8 +137,8 @@ static int
befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * sup)
{
- struct buffer_head *bh = NULL;
- befs_disk_btree_super *od_sup = NULL;
+ struct buffer_head *bh;
+ befs_disk_btree_super *od_sup;
befs_debug(sb, "---> %s", __func__);
@@ -250,7 +250,7 @@ int
befs_btree_find(struct super_block *sb, befs_data_stream * ds,
const char *key, befs_off_t * value)
{
- struct befs_btree_node *this_node = NULL;
+ struct befs_btree_node *this_node;
befs_btree_super bt_super;
befs_off_t node_off;
int res;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee79df9d3..2ac2d8471393f7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3390,13 +3390,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, GFP_KERNEL);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
@@ -3417,10 +3417,10 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
if (*bytes + PAGE_CACHE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index,
GFP_KERNEL)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e65f9ffbb99951..4d6a30e76168c3 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
* config_item_init - initialize item.
* @item: item in question.
*/
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
{
kref_init(&item->ci_kref);
INIT_LIST_HEAD(&item->ci_entry);
}
-EXPORT_SYMBOL(config_item_init);
/**
* config_item_set_name - Set the name of an item
diff --git a/fs/coredump.c b/fs/coredump.c
index e9f499fd8f6a44..50d92e0483511c 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -71,7 +71,8 @@ static int expand_corename(struct core_name *cn, int size)
return 0;
}
-static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
+ va_list arg)
{
int free, need;
va_list arg_copy;
@@ -94,7 +95,7 @@ again:
return -ENOMEM;
}
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
{
va_list arg;
int ret;
@@ -106,7 +107,8 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...)
return ret;
}
-static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3)
+int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
int cur = cn->used;
va_list arg;
@@ -210,11 +212,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* uid */
case 'u':
- err = cn_printf(cn, "%d", cred->uid);
+ err = cn_printf(cn, "%u",
+ from_kuid(&init_user_ns,
+ cred->uid));
break;
/* gid */
case 'g':
- err = cn_printf(cn, "%d", cred->gid);
+ err = cn_printf(cn, "%u",
+ from_kgid(&init_user_ns,
+ cred->gid));
break;
case 'd':
err = cn_printf(cn, "%d",
@@ -222,7 +228,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* signal that caused the coredump */
case 's':
- err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
+ err = cn_printf(cn, "%d",
+ cprm->siginfo->si_signo);
break;
/* UNIX time of coredump */
case 't': {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index add566303c6843..c35ffdc12bbafd 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
#endif
+ if (!devpts_mnt)
+ return NULL;
return devpts_mnt->mnt_sb;
}
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
int devpts_new_index(struct inode *ptmx_inode)
{
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
+ struct pts_fs_info *fsi;
int index;
int ida_ret;
+ if (!sb)
+ return -ENODEV;
+
+ fsi = DEVPTS_SB(sb);
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
struct inode *inode;
- struct dentry *root = sb->s_root;
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
- struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct dentry *root;
+ struct pts_fs_info *fsi;
+ struct pts_mount_opts *opts;
char s[12];
+ if (!sb)
+ return ERR_PTR(-ENODEV);
+
+ root = sb->s_root;
+ fsi = DEVPTS_SB(sb);
+ opts = &fsi->mount_opts;
+
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -676,12 +689,16 @@ static int __init init_devpts_fs(void)
struct ctl_table_header *table;
if (!err) {
+ struct vfsmount *mnt;
+
table = register_sysctl_table(pty_root_table);
- devpts_mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(devpts_mnt)) {
- err = PTR_ERR(devpts_mnt);
+ mnt = kern_mount(&devpts_fs_type);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
unregister_filesystem(&devpts_fs_type);
unregister_sysctl_table(table);
+ } else {
+ devpts_mnt = mnt;
}
}
return err;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 7fca462ea4e341..c8411a30f7dacd 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep;
static struct inode *efs_alloc_inode(struct super_block *sb)
{
struct efs_inode_info *ei;
- ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8850254136ae31..7002467bfbace8 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -106,7 +106,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
if (!journal) {
- ret = generic_file_fsync(file, start, end, datasync);
+ if (test_opt(inode->i_sb, BARRIER))
+ ret = generic_file_fsync(file, start, end, datasync);
+ else
+ ret = __generic_file_fsync(file, start, end, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec21e..5d384921524d97 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can access on only allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9d2e4127..4c71c8c764263a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab45..4307cd4f8da096 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a76..43d3475da83a79 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,8 +14,12 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block aligned on-disk size in bytes*/
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* No support for dir */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation.We are not zeroing out the clusters */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd369895..d04c87da425535 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * allocate a cluster according to the following.
+ * 1) no more available blocks
+ * 2) not part of fallocate region
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -553,13 +584,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/iogstart
+ * for the zero byte file. So, make it return to
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87724c1d7be663..0cf74df68617b8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
goto out;
ret = 0;
- hugetlb_prefault_arch_hook(vma->vm_mm);
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
inode->i_size = len;
out:
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b96bd8076b7060..0bc333b4a594db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
*/
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-retry_alloc:
- new_bh = alloc_buffer_head(GFP_NOFS);
- if (!new_bh) {
- /*
- * Failure is not an option, but __GFP_NOFAIL is going
- * away; so we retry ourselves here.
- */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry_alloc;
- }
+ new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ff2f2e6ad31146..799242cecffbca 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
alloc_transaction:
if (!journal->j_running_transaction) {
+ /*
+ * If __GFP_FS is not present, then we may be being called from
+ * inside the fs writeback layer, so we MUST NOT fail.
+ */
+ if ((gfp_mask & __GFP_FS) == 0)
+ gfp_mask |= __GFP_NOFAIL;
new_transaction = kmem_cache_zalloc(transaction_cache,
gfp_mask);
- if (!new_transaction) {
- /*
- * If __GFP_FS is not present, then we may be
- * being called from inside the fs writeback
- * layer, so we MUST NOT fail. Since
- * __GFP_NOFAIL is going away, we will arrange
- * to retry the allocation ourselves.
- */
- if ((gfp_mask & __GFP_FS) == 0) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto alloc_transaction;
- }
+ if (!new_transaction)
return -ENOMEM;
- }
}
jbd_debug(3, "New handle %p going live.\n", handle);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 1182d1e26a9ca4..086cd0a61e8015 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep;
static struct inode *minix_alloc_inode(struct super_block *sb)
{
struct minix_inode_info *ei;
- ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/mount.h b/fs/mount.h
index b5b8082bfa4208..14db05d424f705 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -118,7 +118,6 @@ static inline void unlock_mount_hash(void)
}
struct proc_mounts {
- struct seq_file m;
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
@@ -127,8 +126,6 @@ struct proc_mounts {
loff_t cached_index;
};
-#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
-
extern const struct seq_operations mounts_op;
extern bool __is_local_mountpoint(struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index ca0244b69de8ba..dde689d0759daf 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -482,6 +482,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -590,7 +591,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -617,7 +618,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -627,7 +628,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -639,7 +640,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +696,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -713,8 +717,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namespace.c b/fs/namespace.c
index f30c78a2b878c5..ecc277107e93f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1226,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options);
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
down_read(&namespace_sem);
if (p->cached_event == p->ns->event) {
@@ -1247,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
p->cached_mount = seq_list_next(v, &p->ns->list, pos);
p->cached_index = *pos;
@@ -1261,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v)
static int m_show(struct seq_file *m, void *v)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = list_entry(v, struct mount, mnt_list);
return p->show(m, &r->mnt);
}
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 22180836ec2201..37dd6b05b1b5bf 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -496,8 +496,7 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
{
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
- if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
- fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+ if (fh_len < NILFS_FID_SIZE_NON_CONNECTABLE ||
(fh_type != FILEID_NILFS_WITH_PARENT &&
fh_type != FILEID_NILFS_WITHOUT_PARENT))
return NULL;
@@ -510,7 +509,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
{
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
- if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+ if (fh_len < NILFS_FID_SIZE_CONNECTABLE ||
fh_type != FILEID_NILFS_WITH_PARENT)
return NULL;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7bb487e663b478..2cd65367076458 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
}
}
err = add_to_page_cache_lru(*cached_page, mapping,
- index, GFP_KERNEL);
+ index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index a44b14cbceebcd..ab172e5f51d91f 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
static inline void ntfs_free(void *addr)
{
- if (!is_vmalloc_addr(addr)) {
- kfree(addr);
- /* free_page((unsigned long)addr); */
- return;
- }
- vfree(addr);
+ kvfree(addr);
}
#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bcfda0fdf..0cdf497c91efbb 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ struct buffer_head *bh = NULL;
+ int status = 0;
+
+ status = ocfs2_inode_lock(inode, &bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
+ return status;
}
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = -EAGAIN;
+ int ret;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
-
- ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0)
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
return ERR_PTR(ret);
+ }
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
-
return acl;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2d7f76e52c379c..0afb4cb7ce1bb9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
- "Extent block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- eb->h_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid h_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto bail;
}
if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid "
- "h_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ goto bail;
}
-
- return 0;
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty "
- "extent list (next_free_rec == 0)",
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent "
- "list where extent # %d has no physical "
- "block start",
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has empty extent list at "
- "depth %u\n",
+ "Owner %llu has empty extent list at depth %u\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad blkno in extent list "
- "at depth %u (index %d)\n",
+ "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad count in extent list "
- "at block %llu (next free=%u, count=%u)\n",
+ "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Inode %llu has non-full interior leaf node %llu"
- "(next free = %u)",
+ "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2526,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2872,8 +2847,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2925,7 +2899,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
struct ocfs2_path *right_path = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
- BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+ if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
+ return 0;
*empty_extent_path = NULL;
@@ -2966,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3039,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3130,6 +3093,30 @@ out:
return ret;
}
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ handle_t *handle;
+ int ret;
+ int credits = path->p_tree_depth * 2 + 1;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+ return ret;
+}
+
/*
* Left rotation of btree records.
*
@@ -3199,7 +3186,7 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu",
+ "Owner %llu has empty extent block at %llu\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
@@ -3627,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3665,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3713,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3734,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3769,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -3929,7 +3960,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has a bad extent list",
+ "Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
@@ -4311,13 +4342,13 @@ out:
return ret;
}
-static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_list *el, int index,
- struct ocfs2_extent_rec *split_rec)
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_merge_ctxt *ctxt)
{
- int status;
+ int status = 0;
enum ocfs2_contig_type ret = CONTIG_NONE;
u32 left_cpos, right_cpos;
struct ocfs2_extent_rec *rec = NULL;
@@ -4336,8 +4367,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
if (left_cpos != 0) {
left_path = ocfs2_new_path_from_path(path);
- if (!left_path)
+ if (!left_path) {
+ status = -ENOMEM;
+ mlog_errno(status);
goto exit;
+ }
status = ocfs2_find_path(et->et_ci, left_path,
left_cpos);
@@ -4351,10 +4385,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of "
- "%d. It should have "
- "matched the l_count of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
@@ -4392,8 +4423,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
goto free_left_path;
right_path = ocfs2_new_path_from_path(path);
- if (!right_path)
+ if (!right_path) {
+ status = -ENOMEM;
+ mlog_errno(status);
goto free_left_path;
+ }
status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (status)
@@ -4406,8 +4440,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
@@ -4433,7 +4466,10 @@ free_right_path:
free_left_path:
ocfs2_free_path(left_path);
exit:
- return ret;
+ if (status == 0)
+ ctxt->c_contig_type = ret;
+
+ return status;
}
static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -4960,10 +4996,9 @@ leftright:
split_index = ocfs2_search_extent_list(el, cpos);
if (split_index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u "
- "which can no longer be found.\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5039,9 +5074,14 @@ int ocfs2_split_extent(handle_t *handle,
goto out;
}
- ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
- split_index,
- split_rec);
+ ret = ocfs2_figure_merge_contig_type(et, path, el,
+ split_index,
+ split_rec,
+ &ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The core merge / split code wants to know how much room is
@@ -5143,10 +5183,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(sb,
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)
- ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5213,9 +5252,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EROFS;
goto out;
@@ -5322,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5499,8 +5545,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5565,7 +5610,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: split at cpos %u lost record.",
+ "Owner %llu: split at cpos %u lost record\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5581,8 +5626,7 @@ int ocfs2_remove_extent(handle_t *handle,
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: error after split at cpos %u"
- "trunc len %u, existing record is (%u,%u)",
+ "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
@@ -5910,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5940,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -5998,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -7096,12 +7137,20 @@ start:
ocfs2_error(inode->i_sb, "Inode %lu has an empty "
"extent record, depth %u\n", inode->i_ino,
le16_to_cpu(root_el->l_tree_depth));
- status = -EROFS;
- goto bail;
+ status = ocfs2_remove_rightmost_empty_extent(osb,
+ &et, path, &dealloc);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_reinit_path(path, 1);
+ goto start;
+ } else {
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
}
- trunc_cpos = le32_to_cpu(rec->e_cpos);
- trunc_len = 0;
- blkno = 0;
} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
/*
* Truncate entire record.
@@ -7189,8 +7238,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
!ocfs2_supports_inline_data(osb)) {
ocfs2_error(inode->i_sb,
- "Inline data flags for inode %llu don't agree! "
- "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le16_to_cpu(di->i_dyn_features),
OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f906a250da6add..519cf8cb0473d8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
- ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return -EROFS;
}
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %Lu",
+ "Inode %llu has with inline data has bad size: %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
@@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0;
+ unsigned int clusters_to_alloc = 0, contig_clusters = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -557,16 +561,21 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
alloc_locked = 1;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
/* fill hole, allocate blocks can't be larger than the size
* of the hole */
clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- if (clusters_to_alloc > contig_blocks)
- clusters_to_alloc = contig_blocks;
+ contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
+ contig_blocks);
+ if (clusters_to_alloc > contig_clusters)
+ clusters_to_alloc = contig_clusters;
/* allocate extent and insert them into the extent tree */
ret = ocfs2_extend_allocation(inode, cpos,
clusters_to_alloc, 0);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog_errno(ret);
goto bail;
}
@@ -574,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
ret = -EIO;
goto bail;
}
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
/*
@@ -619,9 +630,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- if (ocfs2_iocb_is_sem_locked(iocb))
- ocfs2_iocb_clear_sem_locked(iocb);
-
if (ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
@@ -833,12 +841,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
/* zeroing out the previously allocated cluster tail
* that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
zero_len_tail, cluster_align_tail);
- else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ }
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
@@ -925,13 +938,23 @@ clean_orphan:
int update_isize = written > 0 ? 1 : 0;
loff_t end = update_isize ? offset + written : 0;
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+ tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (tmp_ret < 0) {
+ ret = tmp_ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
update_isize, end);
if (tmp_ret < 0) {
ret = tmp_ret;
+ mlog_errno(ret);
goto out;
}
+ ocfs2_inode_unlock(inode, 1);
+
tmp_ret = jbd2_journal_force_commit(journal);
if (tmp_ret < 0) {
ret = tmp_ret;
@@ -2176,10 +2199,7 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
+
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2336,7 +2356,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2345,6 +2365,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
goto out_write_size;
@@ -2400,6 +2428,7 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
* lock, or it will cause a deadlock since journal commit threads holds
* this lock and will ask for the page lock when flushing the data.
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index dd59599b022d5a..24e496d6bdcdba 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
- OCFS2_IOCB_SEM,
OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits {
clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_sem_locked(iocb) \
- set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_sem_locked(iocb) \
- clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_sem_locked(iocb) \
- test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_unaligned_aio(iocb) \
set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f63930..fe50ded1b4ce76 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (status) {
+ /* Clear the rest of the buffers on error */
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 16eff45727eeaa..3a60c83218dba6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
-
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -1061,37 +1061,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1102,7 +1071,7 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1119,18 +1088,18 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ after_hb = ktime_get_real();
+
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec, ret);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index af7598bff1b513..dfe162f5fd4cf3 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
return count;
}
+void __mlog_printk(const u64 *mask, const char *func, int line,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ const char *level;
+ const char *prefix = "";
+
+ if (!__mlog_test_u64(*mask, mlog_and_bits) ||
+ __mlog_test_u64(*mask, mlog_not_bits))
+ return;
+
+ if (*mask & ML_ERROR) {
+ level = KERN_ERR;
+ prefix = "ERROR: ";
+ } else if (*mask & ML_NOTICE) {
+ level = KERN_NOTICE;
+ } else {
+ level = KERN_INFO;
+ }
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%s(%s,%u,%u):%s:%d %s%pV",
+ level, current->comm, task_pid_nr(current),
+ raw_smp_processor_id(), func, line, prefix, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(__mlog_printk);
+
struct mlog_attribute {
struct attribute attr;
u64 mask;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7fdc25a4d8c0e7..308ea0eb35fd11 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#endif
-/*
- * smp_processor_id() "helpfully" screams when called outside preemptible
- * regions in current kernels. sles doesn't have the variants that don't
- * scream. just do this instead of trying to guess which we're building
- * against.. *sigh*.
- */
-#define __mlog_cpu_guess ({ \
- unsigned long _cpu = get_cpu(); \
- put_cpu(); \
- _cpu; \
-})
+__printf(4, 5)
+void __mlog_printk(const u64 *m, const char *func, int line,
+ const char *fmt, ...);
-/* In the following two macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
+/*
+ * Testing before the __mlog_printk call lets the compiler eliminate the
+ * call completely when (m & ML_ALLOWED_BITS) is 0.
*/
-#define __mlog_printk(level, fmt, args...) \
- printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
- task_pid_nr(current), __mlog_cpu_guess, \
- __PRETTY_FUNCTION__, __LINE__ , ##args)
-
-#define mlog(mask, fmt, args...) do { \
- u64 __m = MLOG_MASK_PREFIX | (mask); \
- if ((__m & ML_ALLOWED_BITS) && \
- __mlog_test_u64(__m, mlog_and_bits) && \
- !__mlog_test_u64(__m, mlog_not_bits)) { \
- if (__m & ML_ERROR) \
- __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
- else if (__m & ML_NOTICE) \
- __mlog_printk(KERN_NOTICE, fmt , ##args); \
- else __mlog_printk(KERN_INFO, fmt , ##args); \
- } \
+#define mlog(mask, fmt, ...) \
+do { \
+ u64 _m = MLOG_MASK_PREFIX | (mask); \
+ if (_m & ML_ALLOWED_BITS) \
+ __mlog_printk(&_m, __func__, __LINE__, fmt, \
+ ##__VA_ARGS__); \
} while (0)
#define mlog_errno(st) ({ \
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 56c403a563bc3d..2d0acd6678fe4c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2204,7 +2204,7 @@ out:
kfree(o2net_hand);
kfree(o2net_keep_req);
kfree(o2net_keep_resp);
-
+ o2net_debugfs_exit();
o2quo_exit();
return -ENOMEM;
}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ccd4dcfc36457c..a9513ff0a47f78 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Invalid dirblock #%llu: "
- "signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- trailer->db_signature);
+ rc = ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu has an invalid "
- "db_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid db_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu on dinode "
- "#%llu has an invalid parent_dinode "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
- "Dir Index Root # %llu has bad signature %.*s",
- (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
- 7, dx_root->dr_signature);
- return -EINVAL;
+ ret = ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s\n",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
- 7, dx_leaf->dl_signature);
- return -EROFS;
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+ 7, dx_leaf->dl_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "btree tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in btree tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in btree", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -1617,7 +1607,7 @@ int __ocfs2_add_entry(handle_t *handle,
struct ocfs2_dir_entry *de, *de1;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
struct super_block *sb = dir->i_sb;
- int retval, status;
+ int retval;
unsigned int size = sb->s_blocksize;
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
@@ -1695,25 +1685,25 @@ int __ocfs2_add_entry(handle_t *handle,
}
if (insert_bh == parent_fe_bh)
- status = ocfs2_journal_access_di(handle,
+ retval = ocfs2_journal_access_di(handle,
INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
else {
- status = ocfs2_journal_access_db(handle,
+ retval = ocfs2_journal_access_db(handle,
INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (ocfs2_dir_indexed(dir)) {
- status = ocfs2_dx_dir_insert(dir,
+ if (!retval && ocfs2_dir_indexed(dir))
+ retval = ocfs2_dx_dir_insert(dir,
handle,
lookup);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
- }
+ }
+
+ if (retval) {
+ mlog_errno(retval);
+ goto bail;
}
/* By now the buffer is marked for journaling */
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fae17c640df3ee..e88ccf8c83fff7 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
/* will exit holding res->spinlock, but may drop in function */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
-void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
/* will exit holding res->spinlock, but may drop in function */
static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41d0609a0..46b8b2bbc95ae7 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ada7..2e5e6d5fffe8d8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8b23aa2f52ddaf..23157e40dd7402 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -4025,9 +4025,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
+ /*
+ * blocked lock processing in this loop might call iput which can
+ * remove items off osb->blocked_lock_list. Downconvert up to
+ * 'processed' number of locks, but stop short if we had some
+ * removed in ocfs2_mark_lockres_freeing when downconverting.
+ */
+ while (processed && !list_empty(&osb->blocked_lock_list)) {
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b656ca67..e4719e0a3f9993 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0)\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8f1feca89fb08f..ebd506e577e6d7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1127,6 +1127,7 @@ out:
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0, size_change;
+ int inode_locked = 0;
struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1172,6 +1173,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock_rw;
}
+ inode_locked = 1;
if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
@@ -1252,7 +1254,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_inode_unlock(inode, 1);
+ if (status) {
+ ocfs2_inode_unlock(inode, 1);
+ inode_locked = 0;
+ }
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@@ -1268,6 +1273,8 @@ bail:
if (status < 0)
mlog_errno(status);
}
+ if (inode_locked)
+ ocfs2_inode_unlock(inode, 1);
return status;
}
@@ -2251,7 +2258,7 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level, have_alloc_sem = 0;
+ int direct_io, appending, rw_level;
int can_do_direct, has_refcount = 0;
ssize_t written = 0;
ssize_t ret;
@@ -2280,16 +2287,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
mutex_lock(&inode->i_mutex);
- ocfs2_iocb_clear_sem_locked(iocb);
-
relock:
- /* to match setattr's i_mutex -> rw_lock ordering */
- if (direct_io) {
- have_alloc_sem = 1;
- /* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_sem_locked(iocb);
- }
-
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2299,7 +2297,7 @@ relock:
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
mlog_errno(ret);
- goto out_sems;
+ goto out_mutex;
}
/*
@@ -2348,7 +2346,6 @@ relock:
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
- have_alloc_sem = 0;
rw_level = -1;
direct_io = 0;
@@ -2417,7 +2414,6 @@ no_sync:
*/
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- have_alloc_sem = 0;
unaligned_dio = 0;
}
@@ -2430,10 +2426,7 @@ out:
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
-out_sems:
- if (have_alloc_sem)
- ocfs2_iocb_clear_sem_locked(iocb);
-
+out_mutex:
mutex_unlock(&inode->i_mutex);
if (written)
@@ -2474,7 +2467,7 @@ bail:
static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
struct iov_iter *to)
{
- int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
+ int ret = 0, rw_level = -1, lock_level = 0;
struct file *filp = iocb->ki_filp;
struct inode *inode = file_inode(filp);
@@ -2491,16 +2484,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
goto bail;
}
- ocfs2_iocb_clear_sem_locked(iocb);
-
/*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
if (iocb->ki_flags & IOCB_DIRECT) {
- have_alloc_sem = 1;
- ocfs2_iocb_set_sem_locked(iocb);
-
ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
mlog_errno(ret);
@@ -2536,13 +2524,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
/* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
- have_alloc_sem = 0;
}
bail:
- if (have_alloc_sem)
- ocfs2_iocb_clear_sem_locked(iocb);
-
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416dc8d92d..8f87e05ee25d38 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode)
int wipe, status;
sigset_t oldset;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_delete_inode(inode->i_ino,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail_unlock_nfs_sync;
}
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ /* Skip inode deletion and wait for dio orphan entry recovered
+ * first */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unlock_inode;
+ }
+
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state &= ~I_WILL_FREE;
- return res;
+ return 1;
}
/*
@@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- di->i_signature);
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
- (unsigned long long)bh->b_blocknr);
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
goto bail;
}
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: fs_generation is %u\n",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(di->i_fs_generation));
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
goto bail;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff531928269ed1..ad4c3a0009be2d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
mlog_errno(PTR_ERR(handle));
if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
+ ocfs2_abort(osb->sb, "Detected aborted journal\n");
handle = ERR_PTR(-EROFS);
}
} else {
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- /* We aren't guaranteed to have the superblock here - but if we
- * don't, it'll just crash. */
- ocfs2_error(bh->b_assoc_map->host->i_sb,
+ ocfs2_error(bh->b_bdev->bd_super,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
status = jbd2_journal_dirty_metadata(handle, bh);
- BUG_ON(status);
+ if (status) {
+ mlog_errno(status);
+ if (!is_handle_aborted(handle)) {
+ journal_t *journal = handle->h_transaction->t_journal;
+ struct super_block *sb = bh->b_bdev->bd_super;
+
+ mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
+ "Aborting transaction and journal.\n");
+ handle->h_err = status;
+ jbd2_journal_abort_handle(handle);
+ jbd2_journal_abort(journal, status);
+ ocfs2_abort(sb, "Journal already aborted.\n");
+ }
+ }
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
struct inode *inode = NULL;
struct inode *iter;
struct ocfs2_inode_info *oi;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_recover_orphans(slot);
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto next;
+ }
/*
* We need to take and drop the inode lock to
* force read inode from disk.
*/
- ret = ocfs2_inode_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
- goto next;
+ goto unlock_rw;
}
- ocfs2_inode_unlock(inode, 0);
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
if (inode->i_nlink == 0) {
spin_lock(&oi->ip_lock);
@@ -2174,43 +2193,32 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
spin_unlock(&oi->ip_lock);
- } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
- struct buffer_head *di_bh = NULL;
-
- ret = ocfs2_rw_lock(inode, 1);
- if (ret) {
- mlog_errno(ret);
- goto next;
- }
-
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- ocfs2_rw_unlock(inode, 1);
- mlog_errno(ret);
- goto next;
- }
+ }
+ if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+ (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
ret = ocfs2_truncate_file(inode, di_bh,
i_size_read(inode));
- ocfs2_inode_unlock(inode, 1);
- ocfs2_rw_unlock(inode, 1);
- brelse(di_bh);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
- goto next;
+ goto unlock_inode;
}
- ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
if (ret)
mlog_errno(ret);
wake_up(&OCFS2_I(inode)->append_dio_wq);
} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
next:
iput(inode);
-
+ brelse(di_bh);
+ di_bh = NULL;
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbcd39f3b6..0a4457fb0711b7 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u used bits, but a count shows %u",
+ ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6fd..124471d26a73f4 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 176fe6afd94ecc..a1507488d52181 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1291,6 +1291,15 @@ static int ocfs2_rename(struct inode *old_dir,
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ mlog(ML_ERROR, "new dir %llu has been removed, inode %llu "
+ "can not be moved into it.",
+ (unsigned long long)new_dir->i_ino,
+ (unsigned long long)old_inode->i_ino);
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -1551,12 +1560,25 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_find_entry(old_dentry->d_name.name,
old_dentry->d_name.len, old_dir,
&old_entry_lookup);
- if (status)
+ if (status) {
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
+ }
status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
}
@@ -2670,30 +2692,22 @@ bail:
}
int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
- struct inode *inode, int update_isize,
- loff_t end)
+ struct inode *inode, struct buffer_head *di_bh,
+ int update_isize, loff_t end)
{
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
- struct buffer_head *di_bh = NULL;
- struct ocfs2_dinode *di = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle = NULL;
int status = 0;
- status = ocfs2_inode_lock(inode, &di_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- di = (struct ocfs2_dinode *) di_bh->b_data;
-
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
le16_to_cpu(di->i_dio_orphaned_slot));
if (!orphan_dir_inode) {
status = -ENOENT;
mlog_errno(status);
- goto bail_unlock_inode;
+ goto bail;
}
mutex_lock(&orphan_dir_inode->i_mutex);
@@ -2702,7 +2716,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
mlog_errno(status);
- goto bail_unlock_inode;
+ goto bail;
}
handle = ocfs2_start_trans(osb,
@@ -2749,10 +2763,6 @@ bail_unlock_orphan:
brelse(orphan_dir_bh);
iput(orphan_dir_inode);
-bail_unlock_inode:
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
bail:
return status;
}
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 5ddecce172fad7..e173329eb83057 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
struct inode *inode);
int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
- struct inode *inode, int update_isize,
- loff_t end);
+ struct inode *inode, struct buffer_head *di_bh,
+ int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 460c6c37e683f8..7a012626784766 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -717,6 +719,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
return (u64)clusters << c_to_b_bits;
}
+static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
+ u64 blocks)
+{
+ int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+ sb->s_blocksize_bits;
+
+ blocks += (1 << b_to_c_bits) - 1;
+ return (u32)(blocks >> b_to_c_bits);
+}
+
static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
u64 blocks)
{
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 3d0b63d34225e8..964b727f4ee29a 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)v_block,
(unsigned long long)i_size_read(inode));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d8c6af101f3ff7..6f66268d03ca67 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
- "Refcount block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- rb->rf_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid rf_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid "
- "rf_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2361,10 +2357,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2547,10 +2541,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2674,11 +2666,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3108,11 +3099,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
@@ -3378,10 +3367,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- return -EROFS;
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029630bb37..0456ae399bf71f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -167,12 +167,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
#define do_error(fmt, ...) \
- do{ \
- if (resize) \
- mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
- else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
- } while (0)
+do { \
+ if (resize) \
+ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
+ else \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
@@ -181,44 +181,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- do_error("Group descriptor #%llu has bad signature %.*s",
+ do_error("Group descriptor #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
- do_error("Group descriptor #%llu has an invalid bg_blkno "
- "of %llu",
+ do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
- do_error("Group descriptor #%llu has an invalid "
- "fs_generation of #%u",
+ do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- do_error("Group descriptor #%llu has bit count %u but "
- "claims that %u are free",
+ do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- do_error("Group descriptor #%llu has bit count %u but "
- "max bitmap bits of %u",
+ do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -233,20 +224,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (di->i_blkno != gd->bg_parent_dinode) {
- do_error("Group descriptor #%llu has bad parent "
- "pointer (%llu, expected %llu)",
+ do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- do_error("Group descriptor #%llu has bit count of %u",
+ do_error("Group descriptor #%llu has bit count of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +242,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
((le16_to_cpu(gd->bg_chain) ==
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
- do_error("Group descriptor #%llu has bad chain %u",
+ do_error("Group descriptor #%llu has bad chain %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -384,11 +371,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
- "b_blocknr (%llu)",
- (unsigned long long)group_blkno,
- (unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "group block (%llu) != b_blocknr (%llu)\n",
+ (unsigned long long)group_blkno,
+ (unsigned long long) bg_bh->b_blocknr);
goto bail;
}
@@ -834,9 +820,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
- (unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "Invalid chain allocator %llu\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
goto bail;
}
@@ -1370,12 +1356,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1890,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
- "Chain allocator dinode %llu has %u used "
- "bits but only %u total.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->id1.bitmap1.i_used),
- le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used bits but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le32_to_cpu(fe->id1.bitmap1.i_used),
+ le32_to_cpu(fe->id1.bitmap1.i_total));
goto bail;
}
@@ -2429,12 +2412,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c5660b30644..2fc02f7c294998 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -192,6 +192,7 @@ enum {
Opt_resv_level,
Opt_dir_resv_level,
Opt_journal_async_commit,
+ Opt_err_cont,
Opt_err,
};
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_err_cont, "errors=continue"},
{Opt_err, NULL}
};
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+ break;
+ case Opt_err_cont:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
break;
case Opt_data_ordered:
mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -2541,31 +2554,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state (default)
+ * Return EIO so that only the process errs
+ * Fix the error as if fsck.ocfs2 -y
+ * panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
- panic("OCFS2: (device %s): panic forced after error\n",
- sb->s_id);
+ int rv = 0;
ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
-
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
+ panic("OCFS2: (device %s): panic forced after error\n",
+ sb->s_id);
+ } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ } else { /* default option */
+ rv = -EROFS;
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return rv;
+
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
+
+ return rv;
}
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...)
{
struct va_format vaf;
@@ -2577,12 +2602,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
- ocfs2_handle_error(sb);
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2624,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe92..b477d0b1c7b6ce 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...) \
+ __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
__printf(3, 4)
void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...) \
+ __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
/*
* Void signal blockers, because in-kernel sigprocmask() only fails
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d03bfbf3d27d50..5613883c05c0a0 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has bad "
- "signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- xb->xb_signature);
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an "
- "invalid xb_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an invalid "
- "xb_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
}
return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list + prefix_len, name, name_len);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 84bb65b8357013..4fb17ded7d4780 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode,
struct posix_acl **default_acl, struct posix_acl **acl)
{
struct posix_acl *p;
+ struct posix_acl *clone;
int ret;
+ *acl = NULL;
+ *default_acl = NULL;
+
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
- goto no_acl;
+ return 0;
p = get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(p)) {
- if (p == ERR_PTR(-EOPNOTSUPP))
- goto apply_umask;
- return PTR_ERR(p);
+ if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+ *mode &= ~current_umask();
+ return 0;
}
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- if (!p)
- goto apply_umask;
-
- *acl = posix_acl_clone(p, GFP_NOFS);
- if (!*acl)
+ clone = posix_acl_clone(p, GFP_NOFS);
+ if (!clone)
goto no_mem;
- ret = posix_acl_create_masq(*acl, mode);
+ ret = posix_acl_create_masq(clone, mode);
if (ret < 0)
goto no_mem_clone;
- if (ret == 0) {
- posix_acl_release(*acl);
- *acl = NULL;
- }
+ if (ret == 0)
+ posix_acl_release(clone);
+ else
+ *acl = clone;
- if (!S_ISDIR(*mode)) {
+ if (!S_ISDIR(*mode))
posix_acl_release(p);
- *default_acl = NULL;
- } else {
+ else
*default_acl = p;
- }
- return 0;
-apply_umask:
- *mode &= ~current_umask();
-no_acl:
- *default_acl = NULL;
- *acl = NULL;
return 0;
no_mem_clone:
- posix_acl_release(*acl);
+ posix_acl_release(clone);
no_mem:
posix_acl_release(p);
return -ENOMEM;
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 2183fcf41d5933..d751fcb637bb73 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -71,3 +71,7 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_CHILDREN
+ bool "Include /proc/<pid>/task/<tid>/children file"
+ default n
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd02a9ebfc30e5..ce065cf3104fb5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk)
{
unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+ /*
+ * Parked tasks do not run; they sit in __kthread_parkme().
+ * Without this check, we would report them as running, which is
+ * clearly wrong, so we report them as sleeping instead.
+ */
+ if (tsk->state == TASK_PARKED)
+ state = TASK_INTERRUPTIBLE;
+
BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
return task_state_array[fls(state)];
@@ -569,7 +577,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
static struct pid *
get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
{
@@ -692,4 +700,4 @@ const struct file_operations proc_tid_children_operations = {
.llseek = seq_lseek,
.release = children_seq_release,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
+#endif /* CONFIG_PROC_CHILDREN */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 286a422f440e9e..1d540b3f226fe3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -196,18 +196,205 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task)
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ size_t _count, loff_t *pos)
{
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ char *page;
+ unsigned long count = _count;
+ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long len1, len2, len;
+ unsigned long p;
+ char c;
+ ssize_t rv;
+
+ BUG_ON(*pos < 0);
+
+ tsk = get_proc_task(file_inode(file));
+ if (!tsk)
+ return -ESRCH;
+ mm = get_task_mm(tsk);
+ put_task_struct(tsk);
+ if (!mm)
+ return 0;
+ /* Check if process spawned far enough to have cmdline. */
+ if (!mm->env_end) {
+ rv = 0;
+ goto out_mmput;
+ }
+
+ page = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!page) {
+ rv = -ENOMEM;
+ goto out_mmput;
+ }
+
+ down_read(&mm->mmap_sem);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
+ BUG_ON(arg_start > arg_end);
+ BUG_ON(env_start > env_end);
+
+ len1 = arg_end - arg_start;
+ len2 = env_end - env_start;
+
/*
- * Rely on struct seq_operations::show() being called once
- * per internal buffer allocation. See single_open(), traverse().
+ * Inherently racy -- command line shares address space
+ * with code and data.
*/
- BUG_ON(m->size < PAGE_SIZE);
- m->count += get_cmdline(task, m->buf, PAGE_SIZE);
- return 0;
+ rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+ if (rv <= 0)
+ goto out_free_page;
+
+ rv = 0;
+
+ if (c == '\0') {
+ /* Command line (set of strings) occupies whole ARGV. */
+ if (len1 <= *pos)
+ goto out_free_page;
+
+ p = arg_start + *pos;
+ len = len1 - *pos;
+ while (count > 0 && len > 0) {
+ unsigned int _count;
+ int nr_read;
+
+ _count = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, _count, 0);
+ if (nr_read < 0)
+ rv = nr_read;
+ if (nr_read <= 0)
+ goto out_free_page;
+
+ if (copy_to_user(buf, page, nr_read)) {
+ rv = -EFAULT;
+ goto out_free_page;
+ }
+
+ p += nr_read;
+ len -= nr_read;
+ buf += nr_read;
+ count -= nr_read;
+ rv += nr_read;
+ }
+ } else {
+ /*
+ * Command line (1 string) occupies ARGV and maybe
+ * extends into ENVP.
+ */
+ if (len1 + len2 <= *pos)
+ goto skip_argv_envp;
+ if (len1 <= *pos)
+ goto skip_argv;
+
+ p = arg_start + *pos;
+ len = len1 - *pos;
+ while (count > 0 && len > 0) {
+ unsigned int _count, l;
+ int nr_read;
+ bool final;
+
+ _count = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, _count, 0);
+ if (nr_read < 0)
+ rv = nr_read;
+ if (nr_read <= 0)
+ goto out_free_page;
+
+ /*
+ * Command line can be shorter than whole ARGV
+ * even if last "marker" byte says it is not.
+ */
+ final = false;
+ l = strnlen(page, nr_read);
+ if (l < nr_read) {
+ nr_read = l;
+ final = true;
+ }
+
+ if (copy_to_user(buf, page, nr_read)) {
+ rv = -EFAULT;
+ goto out_free_page;
+ }
+
+ p += nr_read;
+ len -= nr_read;
+ buf += nr_read;
+ count -= nr_read;
+ rv += nr_read;
+
+ if (final)
+ goto out_free_page;
+ }
+skip_argv:
+ /*
+ * Command line (1 string) occupies ARGV and
+ * extends into ENVP.
+ */
+ if (len1 <= *pos) {
+ p = env_start + *pos - len1;
+ len = len1 + len2 - *pos;
+ } else {
+ p = env_start;
+ len = len2;
+ }
+ while (count > 0 && len > 0) {
+ unsigned int _count, l;
+ int nr_read;
+ bool final;
+
+ _count = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, _count, 0);
+ if (nr_read < 0)
+ rv = nr_read;
+ if (nr_read <= 0)
+ goto out_free_page;
+
+ /* Find EOS. */
+ final = false;
+ l = strnlen(page, nr_read);
+ if (l < nr_read) {
+ nr_read = l;
+ final = true;
+ }
+
+ if (copy_to_user(buf, page, nr_read)) {
+ rv = -EFAULT;
+ goto out_free_page;
+ }
+
+ p += nr_read;
+ len -= nr_read;
+ buf += nr_read;
+ count -= nr_read;
+ rv += nr_read;
+
+ if (final)
+ goto out_free_page;
+ }
+skip_argv_envp:
+ ;
+ }
+
+out_free_page:
+ free_page((unsigned long)page);
+out_mmput:
+ mmput(mm);
+ if (rv > 0)
+ *pos += rv;
+ return rv;
}
+static const struct file_operations proc_pid_cmdline_ops = {
+ .read = proc_pid_cmdline_read,
+ .llseek = generic_file_llseek,
+};
+
static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -2572,7 +2759,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- ONE("cmdline", S_IRUGO, proc_pid_cmdline),
+ REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_pid_maps_operations),
@@ -2918,11 +3105,11 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- ONE("cmdline", S_IRUGO, proc_pid_cmdline),
+ REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_tid_maps_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
REG("children", S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6dee68d013ffa6..58be92e1193997 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
[ilog2(VM_MERGEABLE)] = "mg",
+ [ilog2(VM_UFFD_MISSING)]= "um",
+ [ilog2(VM_UFFD_WP)] = "uw",
};
size_t i;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8db932da40091b..8ebd9a33408528 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -17,7 +17,8 @@
static unsigned mounts_poll(struct file *file, poll_table *wait)
{
- struct proc_mounts *p = proc_mounts(file->private_data);
+ struct seq_file *m = file->private_data;
+ struct proc_mounts *p = m->private;
struct mnt_namespace *ns = p->ns;
unsigned res = POLLIN | POLLRDNORM;
int event;
@@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
poll_wait(file, &p->ns->poll, wait);
event = ACCESS_ONCE(ns->event);
- if (p->m.poll_event != event) {
- p->m.poll_event = event;
+ if (m->poll_event != event) {
+ m->poll_event = event;
res |= POLLERR | POLLPRI;
}
@@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb)
static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -126,7 +127,7 @@ out:
static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
struct super_block *sb = mnt->mnt_sb;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -186,7 +187,7 @@ out:
static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
@@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
struct mnt_namespace *ns = NULL;
struct path root;
struct proc_mounts *p;
+ struct seq_file *m;
int ret = -EINVAL;
if (!task)
@@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file,
task_unlock(task);
put_task_struct(task);
- ret = -ENOMEM;
- p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
- if (!p)
+ ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts));
+ if (ret)
goto err_put_path;
- file->private_data = &p->m;
- ret = seq_open(file, &mounts_op);
- if (ret)
- goto err_free;
+ m = file->private_data;
+ m->poll_event = ns->event;
+ p = m->private;
p->ns = ns;
p->root = root;
- p->m.poll_event = ns->event;
p->show = show;
p->cached_event = ~0ULL;
return 0;
- err_free:
- kfree(p);
err_put_path:
path_put(&root);
err_put_ns:
@@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file,
static int mounts_release(struct inode *inode, struct file *file)
{
- struct proc_mounts *p = proc_mounts(file->private_data);
+ struct seq_file *m = file->private_data;
+ struct proc_mounts *p = m->private;
path_put(&p->root);
put_mnt_ns(p->ns);
- return seq_release(inode, file);
+ return seq_release_private(inode, file);
}
static int mounts_open(struct inode *inode, struct file *file)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3e0af317fcc3eb..0e4cf728126f2b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -589,8 +589,7 @@ static struct kmem_cache *reiserfs_inode_cachep;
static struct inode *reiserfs_alloc_inode(struct super_block *sb)
{
struct reiserfs_inode_info *ei;
- ei = (struct reiserfs_inode_info *)
- kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
atomic_set(&ei->openers, 0);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155be80d..760e25dad9850b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,18 +48,21 @@ static void *seq_buf_alloc(unsigned long size)
* ERR_PTR(error). In the end of sequence they return %NULL. ->show()
* returns 0 in case of success and negative number in case of error.
* Returning SEQ_SKIP means "discard this element and move on".
+ * Note: seq_open() will allocate a struct seq_file and store its
+ * pointer in @file->private_data. This pointer should not be modified.
*/
int seq_open(struct file *file, const struct seq_operations *op)
{
- struct seq_file *p = file->private_data;
+ struct seq_file *p;
+
+ WARN_ON(file->private_data);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ file->private_data = p;
- if (!p) {
- p = kmalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return -ENOMEM;
- file->private_data = p;
- }
- memset(p, 0, sizeof(*p));
mutex_init(&p->lock);
p->op = op;
#ifdef CONFIG_USER_NS
diff --git a/fs/splice.c b/fs/splice.c
index 4f355a1c1a9e87..5fc1e50a7f30c4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 2c1036080d5276..a7106eda50241b 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -51,8 +51,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
if (ufs_fragnum(fragment) + count > uspi->s_fpg)
ufs_error (sb, "ufs_free_fragments", "internal error");
-
- lock_ufs(sb);
+
+ mutex_lock(&UFS_SB(sb)->s_lock);
cgno = ufs_dtog(uspi, fragment);
bit = ufs_dtogd(uspi, fragment);
@@ -115,13 +115,13 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
if (sb->s_flags & MS_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
-
- unlock_ufs(sb);
+
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT\n");
return;
failed:
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT (FAILED)\n");
return;
}
@@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
goto failed;
}
- lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
do_more:
overflow = 0;
@@ -211,12 +211,12 @@ do_more:
}
ufs_mark_sb_dirty(sb);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT\n");
return;
failed_unlock:
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
failed:
UFSD("EXIT (FAILED)\n");
return;
@@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
usb1 = ubh_get_usb_first(uspi);
*err = -ENOSPC;
- lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
"fragment %llu, tmp %llu\n",
(unsigned long long)fragment,
(unsigned long long)tmp);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return INVBLOCK;
}
if (fragment < UFS_I(inode)->i_lastfrag) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
}
else {
if (tmp) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
}
@@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
* There is not enough space for user on the device
*/
if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT (FAILED)\n");
return 0;
}
@@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
ufs_clear_frags(inode, result + oldcount,
newcount - oldcount, locked_page != NULL);
}
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -439,7 +439,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
fragment + count);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
if (newcount < request)
ufs_free_fragments (inode, result + newcount, request - newcount);
ufs_free_fragments (inode, tmp, oldcount);
@@ -485,7 +485,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
return result;
}
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT (FAILED)\n");
return 0;
}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 7caa016528883c..fd0203ce1f7fde 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode)
ino = inode->i_ino;
- lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return;
}
@@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode)
bit = ufs_inotocgoff (ino);
ucpi = ufs_load_cylinder (sb, cg);
if (!ucpi) {
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return;
}
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT\n");
}
@@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
sbi = UFS_SB(sb);
uspi = sbi->s_uspi;
- lock_ufs(sb);
+ mutex_lock(&sbi->s_lock);
/*
* Try to place the inode in its parent directory
@@ -331,21 +331,21 @@ cg_found:
sync_dirty_buffer(bh);
brelse(bh);
}
- unlock_ufs(sb);
+ mutex_unlock(&sbi->s_lock);
UFSD("allocating inode %lu\n", inode->i_ino);
UFSD("EXIT\n");
return inode;
fail_remove_inode:
- unlock_ufs(sb);
+ mutex_unlock(&sbi->s_lock);
clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
failed:
- unlock_ufs(sb);
+ mutex_unlock(&sbi->s_lock);
make_bad_inode(inode);
iput (inode);
UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 99aaf5c9bf4d83..f913a6924b2381 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -903,6 +903,9 @@ void ufs_evict_inode(struct inode * inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete)
+ if (want_delete) {
+ lock_ufs(inode->i_sb);
ufs_free_inode(inode);
+ unlock_ufs(inode->i_sb);
+ }
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index f773deb1d2e3fd..7a36e0c19d4b12 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -128,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > sb->s_blocksize)
goto out_notlocked;
+ lock_ufs(dir->i_sb);
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out_notlocked;
+ goto out;
- lock_ufs(dir->i_sb);
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
inode->i_op = &ufs_symlink_inode_operations;
@@ -185,9 +185,13 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
struct inode * inode;
int err;
+ lock_ufs(dir->i_sb);
+ inode_inc_link_count(dir);
+
inode = ufs_new_inode(dir, S_IFDIR|mode);
+ err = PTR_ERR(inode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ goto out_dir;
inode->i_op = &ufs_dir_inode_operations;
inode->i_fop = &ufs_dir_operations;
@@ -195,9 +199,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
inode_inc_link_count(inode);
- lock_ufs(dir->i_sb);
- inode_inc_link_count(dir);
-
err = ufs_make_empty(inode, dir);
if (err)
goto out_fail;
@@ -216,6 +217,7 @@ out_fail:
inode_dec_link_count(inode);
unlock_new_inode(inode);
iput (inode);
+out_dir:
inode_dec_link_count(dir);
unlock_ufs(dir->i_sb);
goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 098508a93c7b30..39f4237a1e9521 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -695,6 +695,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
unsigned flags;
lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
@@ -712,6 +713,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
ufs_put_cstotal(sb);
UFSD("EXIT\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
@@ -1291,6 +1293,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt = 0;
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
@@ -1298,12 +1301,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt |= ufstype;
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
pr_err("ufstype can't be changed during remount\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
@@ -1327,6 +1332,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
*/
#ifndef CONFIG_UFS_FS_WRITE
pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
#else
@@ -1336,11 +1342,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
pr_err("this ufstype is read-only supported\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
pr_err("failed during remounting\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EPERM;
}
@@ -1348,6 +1356,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#endif
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 2a07396d5f9eb6..cf6368d42d4ab8 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -30,6 +30,7 @@ struct ufs_sb_info {
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
+ struct mutex s_lock;
};
struct ufs_inode_info {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 00000000000000..32a4c033cc7b53
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1277 @@
+/*
+ * fs/userfaultfd.c
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Some part derived from fs/eventfd.c (anon inode setup) and
+ * mm/ksm.c (mm hashing).
+ */
+
+#include <linux/hashtable.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
+#include <linux/ioctl.h>
+#include <linux/security.h>
+
+static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+
+enum userfaultfd_state {
+ UFFD_STATE_WAIT_API,
+ UFFD_STATE_RUNNING,
+};
+
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* pseudo fd refcounting */
+ atomic_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* state machine */
+ enum userfaultfd_state state;
+ /* released */
+ bool released;
+ /* mm with one ore more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
+struct userfaultfd_wait_queue {
+ struct uffd_msg msg;
+ wait_queue_t wq;
+ struct userfaultfd_ctx *ctx;
+};
+
+struct userfaultfd_wake_range {
+ unsigned long start;
+ unsigned long len;
+};
+
+static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+ int wake_flags, void *key)
+{
+ struct userfaultfd_wake_range *range = key;
+ int ret;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long start, len;
+
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ ret = 0;
+ /* len == 0 means wake all */
+ start = range->start;
+ len = range->len;
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
+ goto out;
+ ret = wake_up_state(wq->private, mode);
+ if (ret)
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the
+ * other CPUs, the waitqueue may disappear from under
+ * us, see the !list_empty_careful() in
+ * handle_userfault(). try_to_wake_up() has an
+ * implicit smp_mb__before_spinlock, and the
+ * wq->private is read before calling the extern
+ * function "wake_up_state" (which in turns calls
+ * try_to_wake_up). While the spin_lock;spin_unlock;
+ * wouldn't be enough, the smp_mb__before_spinlock is
+ * enough to avoid an explicit smp_mb() here.
+ */
+ list_del_init(&wq->task_list);
+out:
+ return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to the userfaultfd context.
+ *
+ * Returns: In case of success, returns not zero.
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+ if (!atomic_inc_not_zero(&ctx->refcount))
+ BUG();
+}
+
+/**
+ * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to userfaultfd context.
+ *
+ * The userfaultfd context reference must have been previously acquired either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ mmput(ctx->mm);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
+}
+
+static inline void msg_init(struct uffd_msg *msg)
+{
+// BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+ /*
+ * Must use memset to zero out the paddings or kernel data is
+ * leaked to userland.
+ */
+ memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ struct uffd_msg msg;
+ msg_init(&msg);
+ msg.event = UFFD_EVENT_PAGEFAULT;
+ msg.arg.pagefault.address = address;
+ if (flags & FAULT_FLAG_WRITE)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+ * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+ * was a read fault, otherwise if set it means it's
+ * a write fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+ if (reason & VM_UFFD_WP)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+ * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+ * a missing fault, otherwise if set it means it's a
+ * write protect fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ return msg;
+}
+
+/*
+ * Verify the pagetables are still not ok after having reigstered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ bool ret = true;
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+ pmd = pmd_offset(pud, address);
+ /*
+ * READ_ONCE must function as a barrier with narrower scope
+ * and it must be equivalent to:
+ * _pmd = *pmd; barrier();
+ *
+ * This is to deal with the instability (as in
+ * pmd_trans_unstable) of the pmd.
+ */
+ _pmd = READ_ONCE(*pmd);
+ if (!pmd_present(_pmd))
+ goto out;
+
+ ret = false;
+ if (pmd_trans_huge(_pmd))
+ goto out;
+
+ /*
+ * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
+ * and use the standard pte_offset_map() instead of parsing _pmd.
+ */
+ pte = pte_offset_map(pmd, address);
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ if (pte_none(*pte))
+ ret = true;
+ pte_unmap(pte);
+
+out:
+ return ret;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_sem must be released before
+ * returning it.
+ */
+int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue uwq;
+ int ret;
+ bool must_wait;
+
+ BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ ret = VM_FAULT_SIGBUS;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx)
+ goto out;
+
+ BUG_ON(ctx->mm != mm);
+
+ VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
+ VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+
+ /*
+ * If it's already released don't get it. This avoids to loop
+ * in __get_user_pages if userfaultfd_release waits on the
+ * caller of handle_userfault to release the mmap_sem.
+ */
+ if (unlikely(ACCESS_ONCE(ctx->released)))
+ goto out;
+
+ /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+ * even if FAULT_FLAG_TRIED is set without leading to gup()
+ * -EBUSY failures, if the userfaultfd is to be extended for
+ * VM_UFFD_WP tracking and we intend to arm the userfault
+ * without first stopping userland access to the memory. For
+ * VM_UFFD_MISSING userfaults this is enough for now.
+ */
+ if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ /*
+ * Validate the invariant that nowait must allow retry
+ * to be sure not to return SIGBUS erroneously on
+ * nowait invocations.
+ */
+ BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ dump_stack();
+ }
+#endif
+ goto out;
+ }
+
+ /*
+ * Handle nowait, not much to do other than tell it to retry
+ * and wait.
+ */
+ ret = VM_FAULT_RETRY;
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
+ /* take the reference before dropping the mmap_sem */
+ userfaultfd_ctx_get(ctx);
+
+ init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+ uwq.wq.private = current;
+ uwq.msg = userfault_msg(address, flags, reason);
+ uwq.ctx = ctx;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() after __set_current_state prevents the reads
+ * following the spin_unlock to happen before the list_add in
+ * __add_wait_queue.
+ */
+ set_current_state(TASK_KILLABLE);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ up_read(&mm->mmap_sem);
+
+ if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ !fatal_signal_pending(current))) {
+ wake_up_poll(&ctx->fd_wqh, POLLIN);
+ schedule();
+ ret |= VM_FAULT_MAJOR;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(), however because we don't ever run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let any of the two pointers to point to
+ * self. So list_empty_careful won't risk to see both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function and there we don't re-list_add
+ * and it's fine not to block on the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
+ if (!list_empty_careful(&uwq.wq.task_list)) {
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need of list_del_init(), the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.task_list);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+ userfaultfd_ctx_put(ctx);
+
+out:
+ return ret;
+}
+
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev;
+ /* len == 0 means wake all */
+ struct userfaultfd_wake_range range = { .len = 0, };
+ unsigned long new_flags;
+
+ ACCESS_ONCE(ctx->released) = true;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+ * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
+ * changes while handle_userfault released the mmap_sem. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ up_write(&mm->mmap_sem);
+
+ /*
+ * After no new page faults can wait on this fault_*wqh, flush
+ * the last page faults that may have been already waiting on
+ * the fault_*wqh.
+ */
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, POLLHUP);
+ userfaultfd_ctx_put(ctx);
+ return 0;
+}
+
+/* fault_pending_wqh.lock must be hold by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+ struct userfaultfd_ctx *ctx)
+{
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+
+ VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+
+ uwq = NULL;
+ if (!waitqueue_active(&ctx->fault_pending_wqh))
+ goto out;
+ /* walk in reverse to provide FIFO behavior to read userfaults */
+ wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+ typeof(*wq), task_list);
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+ return uwq;
+}
+
+static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ unsigned int ret;
+
+ poll_wait(file, &ctx->fd_wqh, wait);
+
+ switch (ctx->state) {
+ case UFFD_STATE_WAIT_API:
+ return POLLERR;
+ case UFFD_STATE_RUNNING:
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be waken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
+ return POLLERR;
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = POLLIN;
+ return ret;
+ default:
+ BUG();
+ }
+}
+
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+ struct uffd_msg *msg)
+{
+ ssize_t ret;
+ DECLARE_WAITQUEUE(wait, current);
+ struct userfaultfd_wait_queue *uwq;
+
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
+ spin_lock(&ctx->fd_wqh.lock);
+ __add_wait_queue(&ctx->fd_wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
+ /*
+ * The fault_pending_wqh.lock prevents the uwq
+ * to disappear from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh, it's not
+ * pending anymore after we read it.
+ *
+ * Use list_del() by hand (as
+ * userfaultfd_wake_function also uses
+ * list_del_init() by hand) to be sure nobody
+ * changes __remove_wait_queue() to use
+ * list_del_init() in turn breaking the
+ * !list_empty_careful() check in
+ * handle_userfault(). The uwq->wq.task_list
+ * must never be empty at any time during the
+ * refile, or the waitqueue could disappear
+ * from under us. The "wait_queue_head_t"
+ * parameter of __remove_wait_queue() is unused
+ * anyway.
+ */
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (no_wait) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_unlock(&ctx->fd_wqh.lock);
+ schedule();
+ spin_lock(&ctx->fd_wqh.lock);
+ }
+ __remove_wait_queue(&ctx->fd_wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock(&ctx->fd_wqh.lock);
+
+ return ret;
+}
+
+static ssize_t userfaultfd_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ ssize_t _ret, ret = 0;
+ struct uffd_msg msg;
+ int no_wait = file->f_flags & O_NONBLOCK;
+
+ if (ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+ BUG_ON(ctx->state != UFFD_STATE_RUNNING);
+
+ for (;;) {
+ if (count < sizeof(msg))
+ return ret ? ret : -EINVAL;
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+ if (_ret < 0)
+ return ret ? ret : _ret;
+ if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+ return ret ? ret : -EFAULT;
+ ret += sizeof(msg);
+ buf += sizeof(msg);
+ count -= sizeof(msg);
+ /*
+ * Allow to read more than one fault at time but only
+ * block if waiting for the very first one.
+ */
+ no_wait = O_NONBLOCK;
+ }
+}
+
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned long start, end;
+
+ start = range->start;
+ end = range->start + range->len;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /* wake all in the range and autoremove */
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+ range);
+ if (waitqueue_active(&ctx->fault_wqh))
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+}
+
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ /*
+ * To be sure waitqueue_active() is not reordered by the CPU
+ * before the pagetable update, use an explicit SMP memory
+ * barrier here. PT lock release or up_read(mmap_sem) still
+ * have release semantics that can allow the
+ * waitqueue_active() to be reordered before the pte update.
+ */
+ smp_mb();
+
+ /*
+ * Use waitqueue_active because it's very frequent to
+ * change the address space atomically even if there are no
+ * userfaults yet. So we take the spinlock only when we're
+ * sure we've userfaults to wake.
+ */
+ if (waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh))
+ __wake_userfault(ctx, range);
+}
+
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ __u64 task_size = mm->task_size;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (len & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return -EINVAL;
+ if (start < mmap_min_addr)
+ return -EINVAL;
+ if (start >= task_size)
+ return -EINVAL;
+ if (len > task_size - start)
+ return -EINVAL;
+ return 0;
+}
+
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_register uffdio_register;
+ struct uffdio_register __user *user_uffdio_register;
+ unsigned long vm_flags, new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+
+ user_uffdio_register = (struct uffdio_register __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_register, user_uffdio_register,
+ sizeof(uffdio_register)-sizeof(__u64)))
+ goto out;
+
+ ret = -EINVAL;
+ if (!uffdio_register.mode)
+ goto out;
+ if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
+ UFFDIO_REGISTER_MODE_WP))
+ goto out;
+ vm_flags = 0;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+ vm_flags |= VM_UFFD_MISSING;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ vm_flags |= VM_UFFD_WP;
+ /*
+ * FIXME: remove the below error constraint by
+ * implementing the wprotect tracking mode.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_register.range.start;
+ end = start + uffdio_register.range.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /* check not compatible vmas */
+ ret = -EINVAL;
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Check that this vma isn't already owned by a
+ * different userfaultfd. We can't allow more than one
+ * userfaultfd to own a single vma simultaneously or we
+ * wouldn't know which one to deliver the userfaults to.
+ */
+ ret = -EBUSY;
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ ((struct vm_userfaultfd_ctx){ ctx }));
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+ if (!ret) {
+ /*
+ * Now that we scanned all vmas we can already tell
+ * userland which ioctls methods are guaranteed to
+ * succeed on this range.
+ */
+ if (put_user(UFFD_API_RANGE_IOCTLS,
+ &user_uffdio_register->ioctls))
+ ret = -EFAULT;
+ }
+out:
+ return ret;
+}
+
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_range uffdio_unregister;
+ unsigned long new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_unregister.start;
+ end = start + uffdio_unregister.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ ret = -EINVAL;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /*
+ * Check not compatible vmas, not strictly required
+ * here as not compatible vmas cannot have an
+ * userfaultfd_ctx registered on them, but this
+ * provides for more strict behavior to notice
+ * unregistration errors.
+ */
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (!vma->vm_userfaultfd_ctx.ctx)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+out:
+ return ret;
+}
+
+/*
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_wake;
+ struct userfaultfd_wake_range range;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ if (ret)
+ goto out;
+
+ range.start = uffdio_wake.start;
+ range.len = uffdio_wake.len;
+
+ /*
+ * len == 0 means wake all and we don't want to wake all here,
+ * so check it again to be sure.
+ */
+ VM_BUG_ON(!range.len);
+
+ wake_userfault(ctx, &range);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_copy __user *user_uffdio_copy;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_copy = (struct uffdio_copy __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+ /* don't copy "copy" last field */
+ sizeof(uffdio_copy)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ if (ret)
+ goto out;
+ /*
+ * double check for wraparound just in case. copy_from_user()
+ * will later check uffdio_copy.src + uffdio_copy.len to fit
+ * in the userland range.
+ */
+ ret = -EINVAL;
+ if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
+ goto out;
+ if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+ goto out;
+
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len);
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ BUG_ON(!ret);
+ /* len == 0 would wake all */
+ range.len = ret;
+ if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+ range.start = uffdio_copy.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_zeropage uffdio_zeropage;
+ struct uffdio_zeropage __user *user_uffdio_zeropage;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+ /* don't copy "zeropage" last field */
+ sizeof(uffdio_zeropage)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (ret)
+ goto out;
+ ret = -EINVAL;
+ if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+ goto out;
+
+ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+ range.start = uffdio_zeropage.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+/*
+ * userland asks for a certain API version and we return which bits
+ * and ioctl commands are implemented in this kernel for such API
+ * version or -EINVAL if unknown.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_api uffdio_api;
+ void __user *buf = (void __user *)arg;
+ int ret;
+
+ ret = -EINVAL;
+ if (ctx->state != UFFD_STATE_WAIT_API)
+ goto out;
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
+ goto out;
+ if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ret = -EINVAL;
+ goto out;
+ }
+ uffdio_api.features = UFFD_API_FEATURES;
+ uffdio_api.ioctls = UFFD_API_IOCTLS;
+ ret = -EFAULT;
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ctx->state = UFFD_STATE_RUNNING;
+ ret = 0;
+out:
+ return ret;
+}
+
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+ struct userfaultfd_ctx *ctx = file->private_data;
+
+ switch(cmd) {
+ case UFFDIO_API:
+ ret = userfaultfd_api(ctx, arg);
+ break;
+ case UFFDIO_REGISTER:
+ ret = userfaultfd_register(ctx, arg);
+ break;
+ case UFFDIO_UNREGISTER:
+ ret = userfaultfd_unregister(ctx, arg);
+ break;
+ case UFFDIO_WAKE:
+ ret = userfaultfd_wake(ctx, arg);
+ break;
+ case UFFDIO_COPY:
+ ret = userfaultfd_copy(ctx, arg);
+ break;
+ case UFFDIO_ZEROPAGE:
+ ret = userfaultfd_zeropage(ctx, arg);
+ break;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct userfaultfd_ctx *ctx = f->private_data;
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long pending = 0, total = 0;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ pending++;
+ total++;
+ }
+ list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ total++;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ /*
+ * If more protocols will be added, there will be all shown
+ * separated by a space. Like this:
+ * protocols: aa:... bb:...
+ */
+ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+ pending, total, UFFD_API, UFFD_API_FEATURES,
+ UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = userfaultfd_show_fdinfo,
+#endif
+ .release = userfaultfd_release,
+ .poll = userfaultfd_poll,
+ .read = userfaultfd_read,
+ .unlocked_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = userfaultfd_ioctl,
+ .llseek = noop_llseek,
+};
+
+static void init_once_userfaultfd_ctx(void *mem)
+{
+ struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
+
+ init_waitqueue_head(&ctx->fault_pending_wqh);
+ init_waitqueue_head(&ctx->fault_wqh);
+ init_waitqueue_head(&ctx->fd_wqh);
+}
+
+/**
+ * userfaultfd_file_create - Creates an userfaultfd file pointer.
+ * @flags: Flags for the userfaultfd file.
+ *
+ * This function creates an userfaultfd file pointer, w/out installing
+ * it into the fd table. This is useful when the userfaultfd file is
+ * used during the initialization of data structures that require
+ * extra setup after the userfaultfd creation. So the userfaultfd
+ * creation is split into the file pointer creation phase, and the
+ * file descriptor installation phase. In this way races with
+ * userspace closing the newly installed file descriptor can be
+ * avoided. Returns an userfaultfd file pointer, or a proper error
+ * pointer.
+ */
+static struct file *userfaultfd_file_create(int flags)
+{
+ struct file *file;
+ struct userfaultfd_ctx *ctx;
+
+ BUG_ON(!current->mm);
+
+ /* Check the UFFD_* constants for consistency. */
+ BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
+
+ file = ERR_PTR(-EINVAL);
+ if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+ goto out;
+
+ file = ERR_PTR(-ENOMEM);
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto out;
+
+ atomic_set(&ctx->refcount, 1);
+ ctx->flags = flags;
+ ctx->state = UFFD_STATE_WAIT_API;
+ ctx->released = false;
+ ctx->mm = current->mm;
+ /* prevent the mm struct to be freed */
+ atomic_inc(&ctx->mm->mm_users);
+
+ file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ if (IS_ERR(file))
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+out:
+ return file;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ int fd, error;
+ struct file *file;
+
+ error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+ if (error < 0)
+ return error;
+ fd = error;
+
+ file = userfaultfd_file_create(flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_put_unused_fd;
+ }
+ fd_install(fd, file);
+
+ return fd;
+
+err_put_unused_fd:
+ put_unused_fd(fd);
+
+ return error;
+}
+
+static int __init userfaultfd_init(void)
+{
+ userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+ sizeof(struct userfaultfd_ctx),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ init_once_userfaultfd_ctx);
+ return 0;
+}
+__initcall(userfaultfd_init);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 39f1d6a2b04d42..85e2434aeec544 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
}
#endif
-#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
- unsigned long address,
- pmd_t *pmdp)
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmdp)
{
pmd_t pmd = *pmdp;
pmd_clear(pmdp);
@@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
-#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
unsigned long address, pmd_t *pmdp,
int full)
{
- return pmdp_get_and_clear(mm, address, pmdp);
+ return pmdp_huge_get_and_clear(mm, address, pmdp);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
pte_t *ptep);
#endif
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp);
#endif
@@ -189,6 +189,22 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#endif
+#ifndef pmdp_collapse_flush
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+#else
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ BUILD_BUG();
+ return *pmdp;
+}
+#define pmdp_collapse_flush pmdp_collapse_flush
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0995c2de8162c2..f589222bfa87e4 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename,
/* Only NUMA needs hash distribution. 64bit NUMA architectures have
* sufficient vmalloc space.
*/
-#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT)
-#define HASHDIST_DEFAULT 1
+#ifdef CONFIG_NUMA
+#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
+extern int hashdist; /* Distribute hashes across NUMA nodes? */
#else
-#define HASHDIST_DEFAULT 0
+#define hashdist (0)
#endif
-extern int hashdist; /* Distribute hashes across NUMA nodes? */
#endif /* _LINUX_BOOTMEM_H */
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 371e560d13cf1d..dfaa7b3e9ae900 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -5,9 +5,9 @@
/*
* Common definitions for all gcc versions go here.
*/
-#define GCC_VERSION (__GNUC__ * 10000 \
- + __GNUC_MINOR__ * 100 \
- + __GNUC_PATCHLEVEL__)
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
/* Optimization barrier */
@@ -46,55 +46,63 @@
* the inline assembly constraint from =g to =r, in this particular
* case either is valid.
*/
-#define RELOC_HIDE(ptr, off) \
- ({ unsigned long __ptr; \
- __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \
- (typeof(ptr)) (__ptr + (off)); })
+#define RELOC_HIDE(ptr, off) \
+({ \
+ unsigned long __ptr; \
+ __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \
+ (typeof(ptr)) (__ptr + (off)); \
+})
/* Make the optimizer believe the variable can be manipulated arbitrarily. */
-#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var))
+#define OPTIMIZER_HIDE_VAR(var) \
+ __asm__ ("" : "=r" (var) : "0" (var))
#ifdef __CHECKER__
-#define __must_be_array(arr) 0
+#define __must_be_array(a) 0
#else
/* &a[0] degrades to a pointer: a different type from an array */
-#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
#endif
/*
* Force always-inline if the user requests it so via the .config,
* or if gcc is too old:
*/
-#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \
+#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \
!defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4)
-# define inline inline __attribute__((always_inline)) notrace
-# define __inline__ __inline__ __attribute__((always_inline)) notrace
-# define __inline __inline __attribute__((always_inline)) notrace
+#define inline inline __attribute__((always_inline)) notrace
+#define __inline__ __inline__ __attribute__((always_inline)) notrace
+#define __inline __inline __attribute__((always_inline)) notrace
#else
/* A lot of inline functions can cause havoc with function tracing */
-# define inline inline notrace
-# define __inline__ __inline__ notrace
-# define __inline __inline notrace
+#define inline inline notrace
+#define __inline__ __inline__ notrace
+#define __inline __inline notrace
#endif
-#define __deprecated __attribute__((deprecated))
-#define __packed __attribute__((packed))
-#define __weak __attribute__((weak))
-#define __alias(symbol) __attribute__((alias(#symbol)))
+#define __always_inline inline __attribute__((always_inline))
+#define noinline __attribute__((noinline))
+
+#define __deprecated __attribute__((deprecated))
+#define __packed __attribute__((packed))
+#define __weak __attribute__((weak))
+#define __alias(symbol) __attribute__((alias(#symbol)))
/*
- * it doesn't make sense on ARM (currently the only user of __naked) to trace
- * naked functions because then mcount is called without stack and frame pointer
- * being set up and there is no chance to restore the lr register to the value
- * before mcount was called.
+ * it doesn't make sense on ARM (currently the only user of __naked)
+ * to trace naked functions because then mcount is called without
+ * stack and frame pointer being set up and there is no chance to
+ * restore the lr register to the value before mcount was called.
+ *
+ * The asm() bodies of naked functions often depend on standard calling
+ * conventions, therefore they must be noinline and noclone.
*
- * The asm() bodies of naked functions often depend on standard calling conventions,
- * therefore they must be noinline and noclone. GCC 4.[56] currently fail to enforce
- * this, so we must do so ourselves. See GCC PR44290.
+ * GCC 4.[56] currently fail to enforce this, so we must do so ourselves.
+ * See GCC PR44290.
*/
-#define __naked __attribute__((naked)) noinline __noclone notrace
+#define __naked __attribute__((naked)) noinline __noclone notrace
-#define __noreturn __attribute__((noreturn))
+#define __noreturn __attribute__((noreturn))
/*
* From the GCC manual:
@@ -106,19 +114,130 @@
* would be.
* [...]
*/
-#define __pure __attribute__((pure))
-#define __aligned(x) __attribute__((aligned(x)))
-#define __printf(a, b) __attribute__((format(printf, a, b)))
-#define __scanf(a, b) __attribute__((format(scanf, a, b)))
-#define noinline __attribute__((noinline))
-#define __attribute_const__ __attribute__((__const__))
-#define __maybe_unused __attribute__((unused))
-#define __always_unused __attribute__((unused))
-
-#define __gcc_header(x) #x
-#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h)
-#define gcc_header(x) _gcc_header(x)
-#include gcc_header(__GNUC__)
+#define __pure __attribute__((pure))
+#define __aligned(x) __attribute__((aligned(x)))
+#define __printf(a, b) __attribute__((format(printf, a, b)))
+#define __scanf(a, b) __attribute__((format(scanf, a, b)))
+#define __attribute_const__ __attribute__((__const__))
+#define __maybe_unused __attribute__((unused))
+#define __always_unused __attribute__((unused))
+
+/* gcc version specific checks */
+
+#if GCC_VERSION < 30200
+# error Sorry, your compiler is too old - please upgrade it.
+#endif
+
+#if GCC_VERSION < 30300
+# define __used __attribute__((__unused__))
+#else
+# define __used __attribute__((__used__))
+#endif
+
+#ifdef CONFIG_GCOV_KERNEL
+# if GCC_VERSION < 30400
+# error "GCOV profiling support for gcc versions below 3.4 not included"
+# endif /* __GNUC_MINOR__ */
+#endif /* CONFIG_GCOV_KERNEL */
+
+#if GCC_VERSION >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#endif
+
+#if GCC_VERSION >= 40000
+
+/* GCC 4.1.[01] miscompiles __weak */
+#ifdef __KERNEL__
+# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101
+# error Your version of gcc miscompiles the __weak directive
+# endif
+#endif
+
+#define __used __attribute__((__used__))
+#define __compiler_offsetof(a, b) \
+ __builtin_offsetof(a, b)
+
+#if GCC_VERSION >= 40100 && GCC_VERSION < 40600
+# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+#endif
+
+#if GCC_VERSION >= 40300
+/* Mark functions as cold. gcc will assume any path leading to a call
+ * to them will be unlikely. This means a lot of manual unlikely()s
+ * are unnecessary now for any paths leading to the usual suspects
+ * like BUG(), printk(), panic() etc. [but let's keep them for now for
+ * older compilers]
+ *
+ * Early snapshots of gcc 4.3 don't support this and we can't detect this
+ * in the preprocessor, but we can live with this because they're unreleased.
+ * Maketime probing would be overkill here.
+ *
+ * gcc also has a __attribute__((__hot__)) to move hot functions into
+ * a special section, but I don't see any sense in this right now in
+ * the kernel context
+ */
+#define __cold __attribute__((__cold__))
+
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+
+#ifndef __CHECKER__
+# define __compiletime_warning(message) __attribute__((warning(message)))
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* __CHECKER__ */
+#endif /* GCC_VERSION >= 40300 */
+
+#if GCC_VERSION >= 40500
+/*
+ * Mark a position in code as unreachable. This can be used to
+ * suppress control flow warnings after asm blocks that transfer
+ * control elsewhere.
+ *
+ * Early snapshots of gcc 4.5 don't support this and we can't detect
+ * this in the preprocessor, but we can live with this because they're
+ * unreleased. Really, we need to have autoconf for the kernel.
+ */
+#define unreachable() __builtin_unreachable()
+
+/* Mark a function definition as prohibited from being cloned. */
+#define __noclone __attribute__((__noclone__))
+
+#endif /* GCC_VERSION >= 40500 */
+
+#if GCC_VERSION >= 40600
+/*
+ * Tell the optimizer that something else uses this function or variable.
+ */
+#define __visible __attribute__((externally_visible))
+#endif
+
+/*
+ * GCC 'asm goto' miscompiles certain code sequences:
+ *
+ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
+ *
+ * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
+ *
+ * (asm goto is automatically volatile - the naming reflects this.)
+ */
+#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
+
+#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
+#if GCC_VERSION >= 40400
+#define __HAVE_BUILTIN_BSWAP32__
+#define __HAVE_BUILTIN_BSWAP64__
+#endif
+#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600)
+#define __HAVE_BUILTIN_BSWAP16__
+#endif
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
+
+#if GCC_VERSION >= 50000
+#define KASAN_ABI_VERSION 4
+#elif GCC_VERSION >= 40902
+#define KASAN_ABI_VERSION 3
+#endif
+
+#endif /* gcc version >= 40000 specific checks */
#if !defined(__noclone)
#define __noclone /* not needed */
@@ -129,5 +248,3 @@
* code
*/
#define uninitialized_var(x) x = x
-
-#define __always_inline inline __attribute__((always_inline))
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
deleted file mode 100644
index 7d89febe4d7938..00000000000000
--- a/include/linux/compiler-gcc3.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead."
-#endif
-
-#if GCC_VERSION < 30200
-# error Sorry, your compiler is too old - please upgrade it.
-#endif
-
-#if GCC_VERSION >= 30300
-# define __used __attribute__((__used__))
-#else
-# define __used __attribute__((__unused__))
-#endif
-
-#if GCC_VERSION >= 30400
-#define __must_check __attribute__((warn_unused_result))
-#endif
-
-#ifdef CONFIG_GCOV_KERNEL
-# if GCC_VERSION < 30400
-# error "GCOV profiling support for gcc versions below 3.4 not included"
-# endif /* __GNUC_MINOR__ */
-#endif /* CONFIG_GCOV_KERNEL */
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
deleted file mode 100644
index 769e1986463208..00000000000000
--- a/include/linux/compiler-gcc4.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead."
-#endif
-
-/* GCC 4.1.[01] miscompiles __weak */
-#ifdef __KERNEL__
-# if GCC_VERSION >= 40100 && GCC_VERSION <= 40101
-# error Your version of gcc miscompiles the __weak directive
-# endif
-#endif
-
-#define __used __attribute__((__used__))
-#define __must_check __attribute__((warn_unused_result))
-#define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
-
-#if GCC_VERSION >= 40100 && GCC_VERSION < 40600
-# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
-#endif
-
-#if GCC_VERSION >= 40300
-/* Mark functions as cold. gcc will assume any path leading to a call
- to them will be unlikely. This means a lot of manual unlikely()s
- are unnecessary now for any paths leading to the usual suspects
- like BUG(), printk(), panic() etc. [but let's keep them for now for
- older compilers]
-
- Early snapshots of gcc 4.3 don't support this and we can't detect this
- in the preprocessor, but we can live with this because they're unreleased.
- Maketime probing would be overkill here.
-
- gcc also has a __attribute__((__hot__)) to move hot functions into
- a special section, but I don't see any sense in this right now in
- the kernel context */
-#define __cold __attribute__((__cold__))
-
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#ifndef __CHECKER__
-# define __compiletime_warning(message) __attribute__((warning(message)))
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* __CHECKER__ */
-#endif /* GCC_VERSION >= 40300 */
-
-#if GCC_VERSION >= 40500
-/*
- * Mark a position in code as unreachable. This can be used to
- * suppress control flow warnings after asm blocks that transfer
- * control elsewhere.
- *
- * Early snapshots of gcc 4.5 don't support this and we can't detect
- * this in the preprocessor, but we can live with this because they're
- * unreleased. Really, we need to have autoconf for the kernel.
- */
-#define unreachable() __builtin_unreachable()
-
-/* Mark a function definition as prohibited from being cloned. */
-#define __noclone __attribute__((__noclone__))
-
-#endif /* GCC_VERSION >= 40500 */
-
-#if GCC_VERSION >= 40600
-/*
- * Tell the optimizer that something else uses this function or variable.
- */
-#define __visible __attribute__((externally_visible))
-#endif
-
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
-
-#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
-#if GCC_VERSION >= 40400
-#define __HAVE_BUILTIN_BSWAP32__
-#define __HAVE_BUILTIN_BSWAP64__
-#endif
-#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600)
-#define __HAVE_BUILTIN_BSWAP16__
-#endif
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
-
-#if GCC_VERSION >= 40902
-#define KASAN_ABI_VERSION 3
-#endif
diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h
deleted file mode 100644
index efee493714eba9..00000000000000
--- a/include/linux/compiler-gcc5.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead."
-#endif
-
-#define __used __attribute__((__used__))
-#define __must_check __attribute__((warn_unused_result))
-#define __compiler_offsetof(a, b) __builtin_offsetof(a, b)
-
-/* Mark functions as cold. gcc will assume any path leading to a call
- to them will be unlikely. This means a lot of manual unlikely()s
- are unnecessary now for any paths leading to the usual suspects
- like BUG(), printk(), panic() etc. [but let's keep them for now for
- older compilers]
-
- Early snapshots of gcc 4.3 don't support this and we can't detect this
- in the preprocessor, but we can live with this because they're unreleased.
- Maketime probing would be overkill here.
-
- gcc also has a __attribute__((__hot__)) to move hot functions into
- a special section, but I don't see any sense in this right now in
- the kernel context */
-#define __cold __attribute__((__cold__))
-
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#ifndef __CHECKER__
-# define __compiletime_warning(message) __attribute__((warning(message)))
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* __CHECKER__ */
-
-/*
- * Mark a position in code as unreachable. This can be used to
- * suppress control flow warnings after asm blocks that transfer
- * control elsewhere.
- *
- * Early snapshots of gcc 4.5 don't support this and we can't detect
- * this in the preprocessor, but we can live with this because they're
- * unreleased. Really, we need to have autoconf for the kernel.
- */
-#define unreachable() __builtin_unreachable()
-
-/* Mark a function definition as prohibited from being cloned. */
-#define __noclone __attribute__((__noclone__))
-
-/*
- * Tell the optimizer that something else uses this function or variable.
- */
-#define __visible __attribute__((externally_visible))
-
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
-
-#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
-#define __HAVE_BUILTIN_BSWAP32__
-#define __HAVE_BUILTIN_BSWAP64__
-#define __HAVE_BUILTIN_BSWAP16__
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
-
-#define KASAN_ABI_VERSION 4
diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h
index 0c9a2f2c2802fc..d4c71132d07f0b 100644
--- a/include/linux/compiler-intel.h
+++ b/include/linux/compiler-intel.h
@@ -13,10 +13,12 @@
/* Intel ECC compiler doesn't support gcc specific asm stmts.
* It uses intrinsics to do the equivalent things.
*/
+#undef barrier
#undef barrier_data
#undef RELOC_HIDE
#undef OPTIMIZER_HIDE_VAR
+#define barrier() __memory_barrier()
#define barrier_data(ptr) barrier()
#define RELOC_HIDE(ptr, off) \
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 34025df6182962..c9e5c57e4edf2c 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item)
return item->ci_name;
}
-extern void config_item_init(struct config_item *);
extern void config_item_init_type_name(struct config_item *item,
const char *name,
struct config_item_type *type);
diff --git a/include/linux/console.h b/include/linux/console.h
index 9f50fb413c11cb..bd194343c3460d 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -115,6 +115,7 @@ static inline int con_debug_leave(void)
#define CON_BOOT (8)
#define CON_ANYTIME (16) /* Safe to call when cpu is offline */
#define CON_BRL (32) /* Used for a braille device */
+#define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */
struct console {
char name[16];
diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h
new file mode 100644
index 00000000000000..bba7a4d692b34a
--- /dev/null
+++ b/include/linux/crc64_ecma.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __CRC64_ECMA_H_
+#define __CRC64_ECMA_H_
+
+#include <linux/types.h>
+
+
+#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * @pdata: pointer to the data to compute checksum for.
+ * @nbytes: number of bytes in data buffer.
+ * @seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed);
+
+#endif /* __CRC64_ECMA_H_ */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 024c27e7c0faa9..1f75b5d5fbd8e9 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -96,6 +96,8 @@ typedef struct {
#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */
#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */
#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */
+#define EFI_MEMORY_MORE_RELIABLE \
+ ((u64)0x0000000000010000ULL) /* higher reliability */
#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */
#define EFI_MEMORY_DESCRIPTOR_VERSION 1
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos
extern void efi_late_init(void);
extern void efi_free_boot_services(void);
extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
+extern void efi_find_mirror(void);
#else
static inline void efi_late_init(void) {}
static inline void efi_free_boot_services(void) {}
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 8293262401de39..e65ef959546cd8 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -6,16 +6,16 @@
#include <linux/bitops.h>
struct frontswap_ops {
- void (*init)(unsigned);
- int (*store)(unsigned, pgoff_t, struct page *);
- int (*load)(unsigned, pgoff_t, struct page *);
- void (*invalidate_page)(unsigned, pgoff_t);
- void (*invalidate_area)(unsigned);
+ void (*init)(unsigned); /* this swap type was just swapon'ed */
+ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
+ int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
+ void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
+ void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
+ struct frontswap_ops *next; /* private pointer to next ops */
};
extern bool frontswap_enabled;
-extern struct frontswap_ops *
- frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_register_ops(struct frontswap_ops *ops);
extern void frontswap_shrink(unsigned long);
extern unsigned long frontswap_curr_pages(void);
extern void frontswap_writethrough(bool);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0f313f93c586f8..65a517dd32f7ad 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,8 +84,6 @@ struct fsnotify_fname;
* Each group much define these ops. The fsnotify infrastructure will call
* these operations for each relevant group.
*
- * should_send_event - given a group, inode, and mask this function determines
- * if the group is interested in this event.
* handle_event - main call for a group to handle an fs event
* free_group_priv - called when a group refcnt hits 0 to clean up the private union
* freeing_mark - called when a mark is being destroyed for some reason. The group
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6ba7cf23748fe9..ad35f300b9a460 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -384,6 +384,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void page_alloc_init_late(void);
+#else
+static inline void page_alloc_init_late(void)
+{
+}
+#endif
+
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
* GFP flags are used before interrupts are enabled. Once interrupts are
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f0515992..44a840a5397453 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr);
extern int zap_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr);
@@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cfa9351c7536ff..5f0be58640ea6e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -439,6 +439,9 @@ extern int panic_on_unrecovered_nmi;
extern int panic_on_io_nmi;
extern int panic_on_warn;
extern int sysctl_panic_on_stackoverflow;
+
+extern bool crash_kexec_post_notifiers;
+
/*
* Only to be used by arch init code. If the user over-wrote the default
* CONFIG_PANIC_TIMEOUT, honor it.
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e804306ef5e88d..e5fe4c1416a2e5 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -269,6 +269,8 @@ unsigned long paddr_vmcoreinfo_note(void);
vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
#define VMCOREINFO_CONFIG(name) \
vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+#define VMCOREINFO_PHYS_BASE(value) \
+ vmcoreinfo_append_str("PHYS_BASE=%lx\n", (unsigned long)value)
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9497ec7c77ea1b..cc4b0197206006 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -21,7 +21,11 @@
#define INIT_PHYSMEM_REGIONS 4
/* Definition of memblock flags. */
-#define MEMBLOCK_HOTPLUG 0x1 /* hotpluggable region */
+enum {
+ MEMBLOCK_NONE = 0x0, /* No special request */
+ MEMBLOCK_HOTPLUG = 0x1, /* hotpluggable region */
+ MEMBLOCK_MIRROR = 0x2, /* mirrored region */
+};
struct memblock_region {
phys_addr_t base;
@@ -61,7 +65,7 @@ extern bool movable_node_enabled;
phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end,
- int nid);
+ int nid, ulong flags);
phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
phys_addr_t size, phys_addr_t align);
phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -75,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
+ulong choose_memblock_flags(void);
/* Low level functions */
int memblock_add_range(struct memblock_type *type,
@@ -85,14 +91,19 @@ int memblock_remove_range(struct memblock_type *type,
phys_addr_t base,
phys_addr_t size);
-void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range(u64 *idx, int nid, ulong flags,
+ struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
-void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+ struct memblock_type *type_a,
struct memblock_type *type_b, phys_addr_t *out_start,
phys_addr_t *out_end, int *out_nid);
+void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
+ phys_addr_t *out_end);
+
/**
* for_each_mem_range - iterate through memblock areas from type_a and not
* included in type_b. Or just type_a if type_b is NULL.
@@ -100,16 +111,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
-#define for_each_mem_range(i, type_a, type_b, nid, \
+#define for_each_mem_range(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
- for (i = 0, __next_mem_range(&i, nid, type_a, type_b, \
+ for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
- __next_mem_range(&i, nid, type_a, type_b, \
+ __next_mem_range(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
/**
@@ -119,19 +131,35 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
* @type_a: ptr to memblock_type to iterate
* @type_b: ptr to memblock_type which excludes from the iteration
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
*/
-#define for_each_mem_range_rev(i, type_a, type_b, nid, \
+#define for_each_mem_range_rev(i, type_a, type_b, nid, flags, \
p_start, p_end, p_nid) \
for (i = (u64)ULLONG_MAX, \
- __next_mem_range_rev(&i, nid, type_a, type_b, \
+ __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
p_start, p_end, p_nid); \
i != (u64)ULLONG_MAX; \
- __next_mem_range_rev(&i, nid, type_a, type_b, \
+ __next_mem_range_rev(&i, nid, flags, type_a, type_b, \
p_start, p_end, p_nid))
+/**
+ * for_each_reserved_mem_region - iterate over all reserved memblock areas
+ * @i: u64 used as loop variable
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over reserved areas of memblock. Available as soon as memblock
+ * is initialized.
+ */
+#define for_each_reserved_mem_region(i, p_start, p_end) \
+ for (i = 0UL, \
+ __next_reserved_mem_region(&i, p_start, p_end); \
+ i != (u64)ULLONG_MAX; \
+ __next_reserved_mem_region(&i, p_start, p_end))
+
#ifdef CONFIG_MOVABLE_NODE
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
@@ -153,6 +181,11 @@ static inline bool movable_node_is_enabled(void)
}
#endif
+static inline bool memblock_is_mirror(struct memblock_region *m)
+{
+ return m->flags & MEMBLOCK_MIRROR;
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
@@ -181,13 +214,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock. Available as
* soon as memblock is initialized.
*/
-#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \
+#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid) \
for_each_mem_range(i, &memblock.memory, &memblock.reserved, \
- nid, p_start, p_end, p_nid)
+ nid, flags, p_start, p_end, p_nid)
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +230,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
*
* Walks over free (memory && !reserved) areas of memblock in reverse
* order. Available as soon as memblock is initialized.
*/
-#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
+#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end, \
+ p_nid) \
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
- nid, p_start, p_end, p_nid)
+ nid, flags, p_start, p_end, p_nid)
static inline void memblock_set_region_flags(struct memblock_region *r,
unsigned long flags)
@@ -273,7 +309,8 @@ static inline bool memblock_bottom_up(void) { return false; }
#define MEMBLOCK_ALLOC_ACCESSIBLE 0
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end);
+ phys_addr_t start, phys_addr_t end,
+ ulong flags);
phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
phys_addr_t max_addr);
phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
new file mode 100644
index 00000000000000..4efc3f56e6dfda
--- /dev/null
+++ b/include/linux/mm-arch-hooks.h
@@ -0,0 +1,25 @@
+/*
+ * Generic mm no-op hooks.
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_MM_ARCH_HOOKS_H
+#define _LINUX_MM_ARCH_HOOKS_H
+
+#include <asm/mm-arch-hooks.h>
+
+#ifndef arch_remap
+static inline void arch_remap(struct mm_struct *mm,
+ unsigned long old_start, unsigned long old_end,
+ unsigned long new_start, unsigned long new_end)
+{
+}
+#define arch_remap arch_remap
+#endif
+
+#endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4024543b4203eb..7f471789781a93 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
@@ -437,46 +439,6 @@ static inline void compound_unlock_irqrestore(struct page *page,
#endif
}
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
- struct page *head = tail->first_page;
-
- /*
- * page->first_page may be a dangling pointer to an old
- * compound page, so recheck that it is still a tail
- * page before returning.
- */
- smp_rmb();
- if (likely(PageTail(tail)))
- return head;
- return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return compound_head_by_tail(page);
- return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
- if (unlikely(PageTail(page)))
- return page->first_page;
- return page;
-}
-
/*
* The atomic page->_mapcount, starts from -1: so that transitions
* both from it and to it can be tracked, using atomic_inc_and_test
@@ -500,7 +462,7 @@ static inline int page_count(struct page *page)
static inline bool __compound_tail_refcounted(struct page *page)
{
- return !PageSlab(page) && !PageHeadHuge(page);
+ return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
}
/*
@@ -1635,6 +1597,8 @@ extern void free_highmem_page(struct page *page);
extern void adjust_managed_page_count(struct page *page, long count);
extern void mem_init_print_info(const char *str);
+extern void reserve_bootmem_region(unsigned long start, unsigned long end);
+
/* Free the reserved page into the buddy system, so it gets managed. */
static inline void __free_reserved_page(struct page *page)
{
@@ -1724,7 +1688,8 @@ extern void sparse_memory_present_with_active_regions(int nid);
#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
!defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
-static inline int __early_pfn_to_nid(unsigned long pfn)
+static inline int __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
return 0;
}
@@ -1732,7 +1697,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn)
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
/* there is a per-arch backend function. */
-extern int __meminit __early_pfn_to_nid(unsigned long pfn);
+extern int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state);
#endif
extern void set_dma_reserve(unsigned long new_dma_reserve);
@@ -1801,7 +1767,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *);
+ struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
@@ -2150,12 +2116,47 @@ enum mf_flags {
extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
+extern int get_hwpoison_page(struct page *page);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
extern atomic_long_t num_poisoned_pages;
extern int soft_offline_page(struct page *page, int flags);
+
+/*
+ * Error handlers for various types of pages.
+ */
+enum mf_result {
+ MF_IGNORED, /* Error: cannot be handled */
+ MF_FAILED, /* Error: handling failed */
+ MF_DELAYED, /* Will be handled later */
+ MF_RECOVERED, /* Successfully recovered */
+};
+
+enum mf_action_page_type {
+ MF_MSG_KERNEL,
+ MF_MSG_KERNEL_HIGH_ORDER,
+ MF_MSG_SLAB,
+ MF_MSG_DIFFERENT_COMPOUND,
+ MF_MSG_POISONED_HUGE,
+ MF_MSG_HUGE,
+ MF_MSG_FREE_HUGE,
+ MF_MSG_UNMAP_FAILED,
+ MF_MSG_DIRTY_SWAPCACHE,
+ MF_MSG_CLEAN_SWAPCACHE,
+ MF_MSG_DIRTY_MLOCKED_LRU,
+ MF_MSG_CLEAN_MLOCKED_LRU,
+ MF_MSG_DIRTY_UNEVICTABLE_LRU,
+ MF_MSG_CLEAN_UNEVICTABLE_LRU,
+ MF_MSG_DIRTY_LRU,
+ MF_MSG_CLEAN_LRU,
+ MF_MSG_TRUNCATED_LRU,
+ MF_MSG_BUDDY,
+ MF_MSG_BUDDY_2ND,
+ MF_MSG_UNKNOWN,
+};
+
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
unsigned long addr,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0038ac7466fd26..2836da7c3264cc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -265,6 +265,16 @@ struct vm_region {
* this region */
};
+#ifdef CONFIG_USERFAULTFD
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+struct vm_userfaultfd_ctx {
+ struct userfaultfd_ctx *ctx;
+};
+#else /* CONFIG_USERFAULTFD */
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+struct vm_userfaultfd_ctx {};
+#endif /* CONFIG_USERFAULTFD */
+
/*
* This struct defines a memory VMM memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -331,6 +341,7 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
struct core_thread {
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 95243d28a0ee72..61cd67f4d7881c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
___pte; \
})
-#define pmdp_clear_flush_notify(__vma, __haddr, __pmd) \
+#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
({ \
unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
struct mm_struct *___mm = (__vma)->vm_mm; \
pmd_t ___pmd; \
\
- ___pmd = pmdp_clear_flush(__vma, __haddr, __pmd); \
+ ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
mmu_notifier_invalidate_range(___mm, ___haddr, \
___haddr + HPAGE_PMD_SIZE); \
\
___pmd; \
})
-#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd) \
+#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \
({ \
unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
pmd_t ___pmd; \
\
- ___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd); \
+ ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \
mmu_notifier_invalidate_range(__mm, ___haddr, \
___haddr + HPAGE_PMD_SIZE); \
\
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_flush_notify ptep_clear_flush
-#define pmdp_clear_flush_notify pmdp_clear_flush
-#define pmdp_get_and_clear_notify pmdp_get_and_clear
+#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
+#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
#define set_pte_at_notify set_pte_at
#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 54d74f6eb23352..754c25966a0a78 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -762,6 +762,14 @@ typedef struct pglist_data {
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ /*
+ * If memory initialisation on large machines is deferred then this
+ * is the first PFN that needs to be initialised.
+ */
+ unsigned long first_deferred_pfn;
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -1216,11 +1224,16 @@ void sparse_init(void);
#define sparse_index_init(_sec, _nid) do {} while (0)
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool early_pfn_in_nid(unsigned long pfn, int nid);
-#else
-#define early_pfn_in_nid(pfn, nid) (1)
-#endif
+/*
+ * During memory init memblocks map pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * may treat start/end as pfns or sections.
+ */
+struct mminit_pfnnid_cache {
+ unsigned long last_start;
+ unsigned long last_end;
+ int last_nid;
+};
#ifndef early_pfn_valid
#define early_pfn_valid(pfn) (1)
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 3d46fb4708e051..f94da0e65dea90 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
extern int soft_watchdog_enabled;
extern int watchdog_user_enabled;
extern int watchdog_thresh;
+extern unsigned long *watchdog_cpumask_bits;
extern int sysctl_softlockup_all_cpu_backtrace;
struct ctl_table;
extern int proc_watchdog(struct ctl_table *, int ,
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_watchdog_thresh(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
+extern int proc_watchdog_cpumask(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
#endif
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 44b2f6f7bbd832..061e0ffd3493bb 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,6 +32,8 @@ enum oom_scan_t {
/* Thread is the potential origin of an oom condition; kill first on oom */
#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
+extern struct mutex oom_lock;
+
static inline void set_current_oom_origin(void)
{
current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -47,9 +49,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
}
-extern void mark_tsk_oom_victim(struct task_struct *tsk);
-
-extern void unmark_oom_victim(void);
+extern void mark_oom_victim(struct task_struct *tsk);
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -62,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message);
-extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
-
extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
int order, const nodemask_t *nodemask,
struct mem_cgroup *memcg);
@@ -73,8 +70,12 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
unsigned long totalpages, const nodemask_t *nodemask,
bool force_kill);
+extern bool force_out_of_memory(void);
extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *mask, bool force_kill);
+ int order, nodemask_t *mask);
+
+extern void exit_oom_victim(void);
+
extern int register_oom_notifier(struct notifier_block *nb);
extern int unregister_oom_notifier(struct notifier_block *nb);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f34e040b34e9ff..91b7f9b2b77433 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -134,49 +134,68 @@ enum pageflags {
#ifndef __GENERATING_BOUNDS_H
+/* Page flags policies wrt compound pages */
+#define PF_ANY(page, enforce) page
+#define PF_HEAD(page, enforce) compound_head(page)
+#define PF_NO_TAIL(page, enforce) ({ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageTail(page), page); \
+ else \
+ page = compound_head(page); \
+ page;})
+#define PF_NO_COMPOUND(page, enforce) ({ \
+ if (enforce) \
+ VM_BUG_ON_PAGE(PageCompound(page), page); \
+ page;})
+
/*
* Macros to create function definitions for page flags
*/
-#define TESTPAGEFLAG(uname, lname) \
-static inline int Page##uname(const struct page *page) \
- { return test_bit(PG_##lname, &page->flags); }
+#define TESTPAGEFLAG(uname, lname, policy) \
+static inline int Page##uname(struct page *page) \
+ { return test_bit(PG_##lname, &policy(page, 0)->flags); }
-#define SETPAGEFLAG(uname, lname) \
+#define SETPAGEFLAG(uname, lname, policy) \
static inline void SetPage##uname(struct page *page) \
- { set_bit(PG_##lname, &page->flags); }
+ { set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define CLEARPAGEFLAG(uname, lname) \
+#define CLEARPAGEFLAG(uname, lname, policy) \
static inline void ClearPage##uname(struct page *page) \
- { clear_bit(PG_##lname, &page->flags); }
+ { clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __SETPAGEFLAG(uname, lname) \
+#define __SETPAGEFLAG(uname, lname, policy) \
static inline void __SetPage##uname(struct page *page) \
- { __set_bit(PG_##lname, &page->flags); }
+ { __set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __CLEARPAGEFLAG(uname, lname) \
+#define __CLEARPAGEFLAG(uname, lname, policy) \
static inline void __ClearPage##uname(struct page *page) \
- { __clear_bit(PG_##lname, &page->flags); }
+ { __clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTSETFLAG(uname, lname) \
+#define TESTSETFLAG(uname, lname, policy) \
static inline int TestSetPage##uname(struct page *page) \
- { return test_and_set_bit(PG_##lname, &page->flags); }
+ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
-#define TESTCLEARFLAG(uname, lname) \
+#define TESTCLEARFLAG(uname, lname, policy) \
static inline int TestClearPage##uname(struct page *page) \
- { return test_and_clear_bit(PG_##lname, &page->flags); }
+ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define __TESTCLEARFLAG(uname, lname) \
+#define __TESTCLEARFLAG(uname, lname, policy) \
static inline int __TestClearPage##uname(struct page *page) \
- { return __test_and_clear_bit(PG_##lname, &page->flags); }
+ { return __test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
-#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
+#define PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ SETPAGEFLAG(uname, lname, policy) \
+ CLEARPAGEFLAG(uname, lname, policy)
-#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
- __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
+#define __PAGEFLAG(uname, lname, policy) \
+ TESTPAGEFLAG(uname, lname, policy) \
+ __SETPAGEFLAG(uname, lname, policy) \
+ __CLEARPAGEFLAG(uname, lname, policy)
-#define TESTSCFLAG(uname, lname) \
- TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
+#define TESTSCFLAG(uname, lname, policy) \
+ TESTSETFLAG(uname, lname, policy) \
+ TESTCLEARFLAG(uname, lname, policy)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
@@ -205,47 +224,100 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; }
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
-struct page; /* forward declaration */
-
-TESTPAGEFLAG(Locked, locked)
-PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
-PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
- __SETPAGEFLAG(Referenced, referenced)
-PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
-PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
-PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
- TESTCLEARFLAG(Active, active)
-__PAGEFLAG(Slab, slab)
-PAGEFLAG(Checked, checked) /* Used by some filesystems */
-PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
-PAGEFLAG(SavePinned, savepinned); /* Xen */
-PAGEFLAG(Foreign, foreign); /* Xen */
-PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
- __SETPAGEFLAG(SwapBacked, swapbacked)
-
-__PAGEFLAG(SlobFree, slob_free)
+/* Forward declarations */
+struct page;
+static inline int PageCompound(struct page *page);
+static inline int PageTail(struct page *page);
+
+static inline struct page *compound_head_by_tail(struct page *tail)
+{
+ struct page *head = tail->first_page;
+
+ /*
+ * page->first_page may be a dangling pointer to an old
+ * compound page, so recheck that it is still a tail
+ * page before returning.
+ */
+ smp_rmb();
+ if (likely(PageTail(tail)))
+ return head;
+ return tail;
+}
+
+/*
+ * Since either compound page could be dismantled asynchronously in THP
+ * or we access asynchronously arbitrary positioned struct page, there
+ * would be tail flag race. To handle this race, we should call
+ * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
+ */
+static inline struct page *compound_head(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ return compound_head_by_tail(page);
+ return page;
+}
+
+/*
+ * If we access compound page synchronously such as access to
+ * allocated page, there is no need to handle tail flag race, so we can
+ * check tail flag directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+ if (unlikely(PageTail(page)))
+ return page->first_page;
+ return page;
+}
+
+__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Referenced, referenced, PF_HEAD)
+ TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
+ __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
+PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ TESTCLEARFLAG(Active, active, PF_HEAD)
+__PAGEFLAG(Slab, slab, PF_NO_TAIL)
+__PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */
+
+/* Xen */
+PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
+PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
+PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
+
+PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+ __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
+PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
+ __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
/*
* Private page markings that may be used by the filesystem that owns the page
* for its own purposes.
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
*/
-PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
- __CLEARPAGEFLAG(Private, private)
-PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
-PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY)
+ __CLEARPAGEFLAG(Private, private, PF_ANY)
+PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
+ TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
/*
* Only test-and-set exist for PG_writeback. The unconditional operators are
* risky: they bypass page accounting.
*/
-TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
-PAGEFLAG(MappedToDisk, mappedtodisk)
+TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
+ TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
+PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
-PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
+PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND)
+PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
+ TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
#ifdef CONFIG_HIGHMEM
/*
@@ -258,31 +330,33 @@ PAGEFLAG_FALSE(HighMem)
#endif
#ifdef CONFIG_SWAP
-PAGEFLAG(SwapCache, swapcache)
+PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
-PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
- TESTCLEARFLAG(Unevictable, unevictable)
+PAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
+ TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)
#ifdef CONFIG_MMU
-PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
- TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
+PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
+ __TESTCLEARFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked)
TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-PAGEFLAG(Uncached, uncached)
+PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached)
#endif
#ifdef CONFIG_MEMORY_FAILURE
-PAGEFLAG(HWPoison, hwpoison)
-TESTSCFLAG(HWPoison, hwpoison)
+PAGEFLAG(HWPoison, hwpoison, PF_ANY)
+TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
PAGEFLAG_FALSE(HWPoison)
@@ -311,6 +385,7 @@ PAGEFLAG_FALSE(HWPoison)
static inline int PageAnon(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
@@ -323,6 +398,7 @@ static inline int PageAnon(struct page *page)
*/
static inline int PageKsm(struct page *page)
{
+ page = compound_head(page);
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
@@ -334,8 +410,9 @@ u64 stable_page_flags(struct page *page);
static inline int PageUptodate(struct page *page)
{
- int ret = test_bit(PG_uptodate, &(page)->flags);
-
+ int ret;
+ page = compound_head(page);
+ ret = test_bit(PG_uptodate, &(page)->flags);
/*
* Must ensure that the data we read out of the page is loaded
* _after_ we've loaded page->flags to check for PageUptodate.
@@ -352,22 +429,24 @@ static inline int PageUptodate(struct page *page)
static inline void __SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
smp_wmb();
- __set_bit(PG_uptodate, &(page)->flags);
+ __set_bit(PG_uptodate, &page->flags);
}
static inline void SetPageUptodate(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
/*
* Memory barrier must be issued before setting the PG_uptodate bit,
* so that all previous stores issued in order to bring the page
* uptodate are actually visible before PageUptodate becomes true.
*/
smp_wmb();
- set_bit(PG_uptodate, &(page)->flags);
+ set_bit(PG_uptodate, &page->flags);
}
-CLEARPAGEFLAG(Uptodate, uptodate)
+CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
int test_clear_page_writeback(struct page *page);
int __test_set_page_writeback(struct page *page, bool keep_write);
@@ -396,8 +475,8 @@ static inline void set_page_writeback_keepwrite(struct page *page)
* and arch/powerpc/kvm/book3s_64_vio_hv.c which use it to detect huge pages
* and avoid handling those in real mode.
*/
-__PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head)
-__PAGEFLAG(Tail, tail)
+__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY)
+__PAGEFLAG(Tail, tail, PF_ANY)
static inline int PageCompound(struct page *page)
{
@@ -421,8 +500,8 @@ static inline void ClearPageCompound(struct page *page)
* because PageCompound is always set for compound pages and not for
* pages on the LRU and/or pagecache.
*/
-TESTPAGEFLAG(Compound, compound)
-__SETPAGEFLAG(Head, compound) __CLEARPAGEFLAG(Head, compound)
+TESTPAGEFLAG(Compound, compound, PF_ANY)
+__SETPAGEFLAG(Head, compound, PF_ANY) __CLEARPAGEFLAG(Head, compound, PF_ANY)
/*
* PG_reclaim is used in combination with PG_compound to mark the
@@ -518,21 +597,9 @@ static inline int PageTransTail(struct page *page)
}
#else
-
-static inline int PageTransHuge(struct page *page)
-{
- return 0;
-}
-
-static inline int PageTransCompound(struct page *page)
-{
- return 0;
-}
-
-static inline int PageTransTail(struct page *page)
-{
- return 0;
-}
+TESTPAGEFLAG_FALSE(TransHuge)
+TESTPAGEFLAG_FALSE(TransCompound)
+TESTPAGEFLAG_FALSE(TransTail)
#endif
/*
@@ -655,6 +722,10 @@ static inline int page_has_private(struct page *page)
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
+#undef PF_ANY
+#undef PF_HEAD
+#undef PF_NO_TAIL
+#undef PF_NO_COMPOUND
#endif /* !__GENERATING_BOUNDS_H */
#endif /* PAGE_FLAGS_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index fb0814ca65c732..4779de5ed499f8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -426,18 +426,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
extern void unlock_page(struct page *page);
-static inline void __set_page_locked(struct page *page)
-{
- __set_bit(PG_locked, &page->flags);
-}
-
-static inline void __clear_page_locked(struct page *page)
-{
- __clear_bit(PG_locked, &page->flags);
-}
-
static inline int trylock_page(struct page *page)
{
+ page = compound_head(page);
return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
@@ -490,9 +481,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page,
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (PageLocked(page))
- return wait_on_page_bit_killable(page, PG_locked);
- return 0;
+ if (!PageLocked(page))
+ return 0;
+ return wait_on_page_bit_killable(compound_head(page), PG_locked);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -511,7 +502,7 @@ static inline void wake_up_page(struct page *page, int bit)
static inline void wait_on_page_locked(struct page *page)
{
if (PageLocked(page))
- wait_on_page_bit(page, PG_locked);
+ wait_on_page_bit(compound_head(page), PG_locked);
}
/*
@@ -657,17 +648,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __set_page_locked() against it.
+ * the page is new, so we can just run __SetPageLocked() against it.
*/
static inline int add_to_page_cache(struct page *page,
struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
{
int error;
- __set_page_locked(page);
+ __SetPageLocked(page);
error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
if (unlikely(error))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return error;
}
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 2110a81c5e2afa..7b2a7fcde6a320 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -32,6 +32,10 @@
/********** mm/debug-pagealloc.c **********/
#define PAGE_POISON 0xaa
+/********** mm/page_alloc.c ************/
+
+#define TAIL_MAPPING ((void *) 0x01014A11 + POISON_POINTER_DELTA)
+
/********** mm/slab.c **********/
/*
* Magic nums for obj red zoning.
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c914998..58b1fec40d3737 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -30,6 +30,8 @@ static inline const char *printk_skip_level(const char *buffer)
return buffer;
}
+#define CONSOLE_EXT_LOG_MAX 8192
+
/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8d5..bf36b6e644c4ac 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -85,6 +85,7 @@ enum ttu_flags {
TTU_UNMAP = 1, /* unmap mode */
TTU_MIGRATION = 2, /* migration mode */
TTU_MUNLOCK = 4, /* munlock mode */
+ TTU_FREE = 8, /* free mode */
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
@@ -183,7 +184,8 @@ static inline void page_dup_rmap(struct page *page)
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *, int is_locked,
- struct mem_cgroup *memcg, unsigned long *vm_flags);
+ struct mem_cgroup *memcg, unsigned long *vm_flags,
+ int *is_pte_dirty);
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
@@ -260,9 +262,12 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
static inline int page_referenced(struct page *page, int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
return 0;
}
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 587017e7939c4b..ad6e32f3eab210 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -133,6 +133,10 @@ struct rtc_device {
/* Some hardware can't support UIE mode */
int uie_unsupported;
+#ifdef CONFIG_PM_SLEEP
+ struct rtc_wkalrm alarm;
+ bool valid_alarm;
+#endif
#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
struct work_struct uie_task;
struct timer_list uie_timer;
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 50a8486c524bb2..9b1ef0c820a72d 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -265,13 +265,16 @@ int sg_alloc_table_from_pages(struct sg_table *sgt,
unsigned long offset, unsigned long size,
gfp_t gfp_mask);
+size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
+ size_t buflen, off_t skip, bool to_buffer);
+
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
- void *buf, size_t buflen);
+ const void *buf, size_t buflen);
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
void *buf, size_t buflen);
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
- void *buf, size_t buflen, off_t skip);
+ const void *buf, size_t buflen, off_t skip);
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
void *buf, size_t buflen, off_t skip);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 380d7f4c1f3de2..d505bcab1c47b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2548,8 +2548,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(struct task_struct *, struct mm_struct *);
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+ struct task_struct *, unsigned long);
+#else
extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+ unsigned long clone_flags, unsigned long sp, unsigned long arg,
+ struct task_struct *p, unsigned long tls)
+{
+ return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
extern void flush_thread(void);
extern void exit_thread(void);
@@ -2568,6 +2582,7 @@ extern int do_execveat(int, struct filename *,
const char __user * const __user *,
const char __user * const __user *,
int);
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ffd24c8301513f..ca761238b2a596 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,8 +153,30 @@ size_t ksize(const void *);
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
+/*
+ * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
+ * to create the kmalloc_caches object in create_kmalloc_caches(). The first
+ * and the second are 96 and 192. You can see that in the kmalloc_index(), if
+ * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
+ * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
+ * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
+ */
+#if KMALLOC_MIN_SIZE <= 32
+#define KMALLOC_LOOP_LOW 1
+#elif KMALLOC_MIN_SIZE <= 64
+#define KMALLOC_LOOP_LOW 2
+#else
+#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
+#endif
+
#else
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+/*
+ * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
+ * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
+ * initialized.
+ */
+#define KMALLOC_LOOP_LOW 1
#endif
/*
@@ -240,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
* belongs to.
* 0 = zero alloc
* 1 = 65 .. 96 bytes
- * 2 = 120 .. 192 bytes
- * n = 2^(n-1) .. 2^n -1
+ * 2 = 129 .. 192 bytes
+ * n = 2^(n-1)+1 .. 2^n
*/
static __always_inline int kmalloc_index(size_t size)
{
@@ -290,6 +312,16 @@ void *__kmalloc(size_t size, gfp_t flags);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
void kmem_cache_free(struct kmem_cache *, void *);
+/*
+ * Bulk allocation and freeing operations. These are accellerated in an
+ * allocator specific way to avoid taking locks repeatedly or building
+ * metadata structures unnecessarily.
+ *
+ * Note that interrupts must be enabled when calling these functions.
+ */
+void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index d600afb2192673..da3c593f9845b0 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -27,6 +27,8 @@ struct smpboot_thread_data;
* @pre_unpark: Optional unpark function, called before the thread is
* unparked (cpu online). This is not guaranteed to be
* called on the target cpu of the thread. Careful!
+ * @cpumask: Internal state. To update which threads are unparked,
+ * call smpboot_update_cpumask_percpu_thread().
* @selfparking: Thread is not parked by the park function.
* @thread_comm: The base name of the thread
*/
@@ -41,11 +43,14 @@ struct smp_hotplug_thread {
void (*park)(unsigned int cpu);
void (*unpark)(unsigned int cpu);
void (*pre_unpark)(unsigned int cpu);
+ cpumask_var_t cpumask;
bool selfparking;
const char *thread_comm;
};
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *);
#endif
diff --git a/include/linux/string.h b/include/linux/string.h
index e40099e585c99b..12a5a60f1f3bf2 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -117,6 +117,7 @@ extern void kfree_const(const void *x);
extern char *kstrdup(const char *s, gfp_t gfp);
extern const char *kstrdup_const(const char *s, gfp_t gfp);
extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
+extern char *kstrimdup(const char *s, gfp_t gfp);
extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 38874729dc5fd8..9a7adfb85e2294 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -308,6 +308,7 @@ extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
+extern void deactivate_page(struct page *page);
extern void swap_setup(void);
extern void add_page_to_unevictable_list(struct page *page);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d8b06abb264f3f..08001317aee737 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
+asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
@@ -827,15 +828,15 @@ asmlinkage long sys_syncfs(int fd);
asmlinkage long sys_fork(void);
asmlinkage long sys_vfork(void);
#ifdef CONFIG_CLONE_BACKWARDS
-asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
+asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long,
int __user *);
#else
#ifdef CONFIG_CLONE_BACKWARDS3
asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
- int __user *, int);
+ int __user *, unsigned long);
#else
asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
- int __user *, int);
+ int __user *, unsigned long);
#endif
#endif
diff --git a/include/linux/syslog.h b/include/linux/syslog.h
index 4b7b875a7ce1fe..c3a7f0cc3a275f 100644
--- a/include/linux/syslog.h
+++ b/include/linux/syslog.h
@@ -47,12 +47,12 @@
#define SYSLOG_FROM_READER 0
#define SYSLOG_FROM_PROC 1
-int do_syslog(int type, char __user *buf, int count, bool from_file);
+int do_syslog(int type, char __user *buf, int count, int source);
#ifdef CONFIG_PRINTK
-int check_syslog_permissions(int type, bool from_file);
+int check_syslog_permissions(int type, int source);
#else
-static inline int check_syslog_permissions(int type, bool from_file)
+static inline int check_syslog_permissions(int type, int source)
{
return 0;
}
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 00000000000000..587480ad41b7b1
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,85 @@
+/*
+ * include/linux/userfaultfd_k.h
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_K_H
+#define _LINUX_USERFAULTFD_K_H
+
+#ifdef CONFIG_USERFAULTFD
+
+#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
+
+#include <linux/fcntl.h>
+
+/*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from userfaultfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define UFFD_CLOEXEC O_CLOEXEC
+#define UFFD_NONBLOCK O_NONBLOCK
+
+#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+
+extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason);
+
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len);
+
+/* mm helpers */
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_UFFD_MISSING;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+}
+
+#else /* CONFIG_USERFAULTFD */
+
+/* mm helpers */
+static inline int handle_userfault(struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return true;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+#endif /* CONFIG_USERFAULTFD */
+
+#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9246d32dc97343..2b1cef88b827ae 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
+ PGLAZYFREED,
FOR_ALL_ZONES(PGREFILL),
FOR_ALL_ZONES(PGSTEAL_KSWAPD),
FOR_ALL_ZONES(PGSTEAL_DIRECT),
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a947..d3d077228d4c15 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
typedef int wait_bit_action_f(struct wait_bit_key *);
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key);
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, (void *) (m))
#define wake_up_locked_poll(x, m) \
- __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+ __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
#define wake_up_interruptible_sync_poll(x, m) \
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 56529b34dc6353..d30eff3d84d542 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -81,7 +81,8 @@ struct zpool_driver {
atomic_t refcount;
struct list_head list;
- void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops);
+ void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+ struct zpool *zpool);
void (*destroy)(void *pool);
int (*malloc)(void *pool, size_t size, gfp_t gfp,
@@ -102,6 +103,4 @@ void zpool_register_driver(struct zpool_driver *driver);
int zpool_unregister_driver(struct zpool_driver *driver);
-int zpool_evict(void *pool, unsigned long handle);
-
#endif
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 79abb9c71772ef..1443d79e4fe66b 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -11,6 +11,7 @@
#include <linux/pci.h>
#include <linux/aer.h>
#include <linux/cper.h>
+#include <linux/mm.h>
/*
* MCE Extended Error Log trace event
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event,
__print_flags(__entry->status, "|", aer_uncorrectable_errors))
);
+/*
+ * memory-failure recovery action result event
+ *
+ * unsigned long pfn - Page Frame Number of the corrupted page
+ * int type - Page types of the corrupted page
+ * int result - Result of recovery action
+ */
+
+#ifdef CONFIG_MEMORY_FAILURE
+#define MF_ACTION_RESULT \
+ EM ( MF_IGNORED, "Ignored" ) \
+ EM ( MF_FAILED, "Failed" ) \
+ EM ( MF_DELAYED, "Delayed" ) \
+ EMe ( MF_RECOVERED, "Recovered" )
+
+#define MF_PAGE_TYPE \
+ EM ( MF_MSG_KERNEL, "reserved kernel page" ) \
+ EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \
+ EM ( MF_MSG_SLAB, "kernel slab page" ) \
+ EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
+ EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \
+ EM ( MF_MSG_HUGE, "huge page" ) \
+ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
+ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \
+ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \
+ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \
+ EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \
+ EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \
+ EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \
+ EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \
+ EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \
+ EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \
+ EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \
+ EM ( MF_MSG_BUDDY, "free buddy page" ) \
+ EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \
+ EMe ( MF_MSG_UNKNOWN, "unknown page" )
+
+/*
+ * First define the enums in MM_ACTION_RESULT to be exported to userspace
+ * via TRACE_DEFINE_ENUM().
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) TRACE_DEFINE_ENUM(a);
+#define EMe(a, b) TRACE_DEFINE_ENUM(a);
+
+MF_ACTION_RESULT
+MF_PAGE_TYPE
+
+/*
+ * Now redefine the EM() and EMe() macros to map the enums to the strings
+ * that will be printed in the output.
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) { a, b },
+#define EMe(a, b) { a, b }
+
+TRACE_EVENT(memory_failure_event,
+ TP_PROTO(unsigned long pfn,
+ int type,
+ int result),
+
+ TP_ARGS(pfn, type, result),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(int, type)
+ __field(int, result)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->type = type;
+ __entry->result = result;
+ ),
+
+ TP_printk("pfn %#lx: recovery action for %s: %s",
+ __entry->pfn,
+ __print_symbolic(__entry->type, MF_PAGE_TYPE),
+ __print_symbolic(__entry->result, MF_ACTION_RESULT)
+ )
+);
+#endif /* CONFIG_MEMORY_FAILURE */
#endif /* _TRACE_HW_EVENT_MC_H */
/* This part must be outside protection */
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index ddc3b36f1046bd..7a94102b7a0277 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -34,6 +34,7 @@
#define MADV_SEQUENTIAL 2 /* expect sequential page references */
#define MADV_WILLNEED 3 /* will need these pages */
#define MADV_DONTNEED 4 /* don't need these pages */
+#define MADV_FREE 5 /* free pages only if memory pressure */
/* common parameters: try to keep these consistent across architectures */
#define MADV_REMOVE 9 /* remove these pages & resources */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 5da885cec4e4d2..d73d0bd0149bdc 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -454,3 +454,4 @@ header-y += xfrm.h
header-y += xilinx-v4l2-controls.h
header-y += zorro.h
header-y += zorro_ids.h
+header-y += userfaultfd.h
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 00000000000000..df0e09bb7dd5a2
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,169 @@
+/*
+ * include/linux/userfaultfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#include <linux/compiler.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ * UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS \
+ ((__u64)1 << _UFFDIO_REGISTER | \
+ (__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F. UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER (0x00)
+#define _UFFDIO_UNREGISTER (0x01)
+#define _UFFDIO_WAKE (0x02)
+#define _UFFDIO_COPY (0x03)
+#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_API (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
+ struct uffdio_api)
+#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+ struct uffdio_register)
+#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
+ struct uffdio_range)
+#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
+ struct uffdio_range)
+#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
+ struct uffdio_copy)
+#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+ struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+ __u8 event;
+
+ __u8 reserved1;
+ __u16 reserved2;
+ __u32 reserved3;
+
+ union {
+ struct {
+ __u64 flags;
+ __u64 address;
+ } pagefault;
+
+ struct {
+ /* unused reserved fields */
+ __u64 reserved1;
+ __u64 reserved2;
+ __u64 reserved3;
+ } reserved;
+ } arg;
+} __packed;
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT 0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK 0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+ /* userland asks for an API number and the features to enable */
+ __u64 api;
+ /*
+ * Kernel answers below with the all available features for
+ * the API, this notifies userland of which events and/or
+ * which flags for each event are enabled in the current
+ * kernel.
+ *
+ * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+ * are to be considered implicitly always enabled in all kernels as
+ * long as the uffdio_api.api requested matches UFFD_API.
+ */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
+#define UFFD_FEATURE_EVENT_FORK (1<<1)
+#endif
+ __u64 features;
+
+ __u64 ioctls;
+};
+
+struct uffdio_range {
+ __u64 start;
+ __u64 len;
+};
+
+struct uffdio_register {
+ struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * kernel answers which ioctl commands are available for the
+ * range, keep at the end as the last 8 bytes aren't read.
+ */
+ __u64 ioctls;
+};
+
+struct uffdio_copy {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * There will be a wrprotection flag later that allows to map
+ * pages wrprotected on the fly. And such a flag will be
+ * available if the wrprotection ioctl are implemented for the
+ * range according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "copy" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 copy;
+};
+
+struct uffdio_zeropage {
+ struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "zeropage" is written by the ioctl and must be at the end:
+ * the copy_from_user will not read the last 8 bytes.
+ */
+ __s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
diff --git a/init/Kconfig b/init/Kconfig
index f17bf480237382..3593213c44ed21 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1141,6 +1141,7 @@ endif # CGROUPS
config CHECKPOINT_RESTORE
bool "Checkpoint/restore support" if EXPERT
+ select PROC_CHILDREN
default n
help
Enables additional kernel features in a sake of checkpoint/restore.
@@ -1641,6 +1642,17 @@ config ADVISE_SYSCALLS
applications use these syscalls, you can disable this option to save
space.
+config USERFAULTFD
+ bool "Enable userfaultfd() system call"
+ select ANON_INODES
+ default y
+ depends on MMU
+ help
+ Enable the userfaultfd() system call that allows to intercept and
+ handle page faults in userland.
+
+ If unsure, say Y.
+
config PCI_QUIRKS
default y
bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/init/do_mounts.c b/init/do_mounts.c
index a95bbdb2a50232..dea5de95c2dd23 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -533,8 +533,13 @@ void __init mount_root(void)
}
#endif
#ifdef CONFIG_BLOCK
- create_dev("/dev/root", ROOT_DEV);
- mount_block_root("/dev/root", root_mountflags);
+ {
+ int err = create_dev("/dev/root", ROOT_DEV);
+
+ if (err < 0)
+ pr_emerg("Failed to create /dev/root: %d\n", err);
+ mount_block_root("/dev/root", root_mountflags);
+ }
#endif
}
diff --git a/init/main.c b/init/main.c
index edcb134406469d..998bf46bfa4809 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1003,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void)
smp_init();
sched_init_smp();
+ page_alloc_init_late();
+
do_basic_setup();
/* Open the /dev/console on the rootfs, this should never fail */
diff --git a/ipc/msg.c b/ipc/msg.c
index 2b6fdbb9e0e9ae..0ab85c0f4c464d 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -37,6 +37,7 @@
#include <linux/rwsem.h>
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
+#include <linux/freezer.h>
#include <asm/current.h>
#include <linux/uaccess.h>
@@ -196,7 +197,7 @@ static void expunge_all(struct msg_queue *msq, int res)
* or dealing with -EAGAIN cases. See lockless receive part 1
* and 2 in do_msgrcv().
*/
- smp_mb();
+ smp_wmb();
msr->r_msg = ERR_PTR(res);
}
}
@@ -580,7 +581,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
/* initialize pipelined send ordering */
msr->r_msg = NULL;
wake_up_process(msr->r_tsk);
- smp_mb(); /* see barrier comment below */
+ smp_wmb(); /* see barrier comment below */
msr->r_msg = ERR_PTR(-E2BIG);
} else {
msr->r_msg = NULL;
@@ -589,11 +590,12 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
wake_up_process(msr->r_tsk);
/*
* Ensure that the wakeup is visible before
- * setting r_msg, as the receiving end depends
- * on it. See lockless receive part 1 and 2 in
- * do_msgrcv().
+ * setting r_msg, as the receiving can otherwise
+ * exit - once r_msg is set, the receiver can
+ * continue. See lockless receive part 1 and 2
+ * in do_msgrcv().
*/
- smp_mb();
+ smp_wmb();
msr->r_msg = msg;
return 1;
@@ -673,7 +675,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
rcu_read_lock();
ipc_lock_object(&msq->q_perm);
@@ -915,7 +917,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
ipc_unlock_object(&msq->q_perm);
rcu_read_unlock();
- schedule();
+ freezable_schedule();
/* Lockless receive, part 1:
* Disable preemption. We don't hold a reference to the queue
@@ -934,10 +936,22 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
* wake_up_process(). There is a race with exit(), see
* ipc/mqueue.c for the details.
*/
- msg = (struct msg_msg *)msr_d.r_msg;
- while (msg == NULL) {
- cpu_relax();
+ for (;;) {
+ /*
+ * Pairs with writer barrier in pipelined_send
+ * or expunge_all
+ */
+ smp_rmb();
msg = (struct msg_msg *)msr_d.r_msg;
+ if (msg)
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier
+ * which forces everything in this loop to be
+ * re-loaded.
+ */
+ cpu_relax();
}
/* Lockless receive, part 3:
diff --git a/ipc/shm.c b/ipc/shm.c
index 6d767071c3673d..763e6e3c37dc32 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -155,8 +155,11 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
{
struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
- if (IS_ERR(ipcp))
- return (struct shmid_kernel *)ipcp;
+ /*
+ * We raced in the idr lookup or with RMID. Either way, the ID is
+ * busted.
+ */
+ BUG_ON(IS_ERR(ipcp));
return container_of(ipcp, struct shmid_kernel, shm_perm);
}
@@ -191,7 +194,6 @@ static void shm_open(struct vm_area_struct *vma)
struct shmid_kernel *shp;
shp = shm_lock(sfd->ns, sfd->id);
- BUG_ON(IS_ERR(shp));
shp->shm_atim = get_seconds();
shp->shm_lprid = task_tgid_vnr(current);
shp->shm_nattch++;
@@ -258,7 +260,6 @@ static void shm_close(struct vm_area_struct *vma)
down_write(&shm_ids(ns).rwsem);
/* remove from the list of attaches of the shm segment */
shp = shm_lock(ns, sfd->id);
- BUG_ON(IS_ERR(shp));
shp->shm_lprid = task_tgid_vnr(current);
shp->shm_dtim = get_seconds();
shp->shm_nattch--;
@@ -1191,7 +1192,6 @@ out_fput:
out_nattch:
down_write(&shm_ids(ns).rwsem);
shp = shm_lock(ns, shmid);
- BUG_ON(IS_ERR(shp));
shp->shm_nattch--;
if (shm_may_destroy(ns, shp))
shm_destroy(ns, shp);
diff --git a/ipc/util.c b/ipc/util.c
index ff3323ef8d8b4e..537a41c7f63391 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -467,10 +467,7 @@ void ipc_rcu_free(struct rcu_head *head)
{
struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
- if (is_vmalloc_addr(p))
- vfree(p);
- else
- kfree(p);
+ kvfree(p);
}
/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05dec4022..031325e9acf90a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- unmark_oom_victim();
+ exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -711,10 +711,10 @@ void do_exit(long code)
current->comm, task_pid_nr(current),
preempt_count());
- acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk->mm);
+ acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ae2248c2683c4..88bd4291c39312 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -449,8 +449,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -1234,7 +1235,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
- int trace)
+ int trace,
+ unsigned long tls)
{
int retval;
struct task_struct *p;
@@ -1443,7 +1445,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread(clone_flags, stack_start, stack_size, p);
+ retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -1655,7 +1657,7 @@ static inline void init_idle_pids(struct pid_link *links)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+ task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
if (!IS_ERR(task)) {
init_idle_pids(task->pids);
init_idle(task, cpu);
@@ -1670,11 +1672,12 @@ struct task_struct *fork_idle(int cpu)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr,
+ unsigned long tls)
{
struct task_struct *p;
int trace = 0;
@@ -1699,7 +1702,7 @@ long do_fork(unsigned long clone_flags,
}
p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace);
+ child_tidptr, NULL, trace, tls);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1740,20 +1743,34 @@ long do_fork(unsigned long clone_flags,
return nr;
}
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below. */
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ return _do_fork(clone_flags, stack_start, stack_size,
+ parent_tidptr, child_tidptr, 0);
+}
+#endif
+
/*
* Create a kernel thread.
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL);
+ return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+ (unsigned long)arg, NULL, NULL, 0);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+ return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -1764,8 +1781,8 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL);
+ return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+ 0, NULL, NULL, 0);
}
#endif
@@ -1773,27 +1790,27 @@ SYSCALL_DEFINE0(vfork)
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
- int, tls_val,
+ unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
- int, tls_val)
+ unsigned long, tls)
#endif
{
- return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+ return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
#endif
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index a744098e4eb76f..7080ae1eb6c10d 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -92,6 +92,12 @@ void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_time_profile);
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 826ba9fb5e3277..e25e92fb44face 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
+#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#define GCOV_COUNTERS 10
+#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
#else
#define GCOV_COUNTERS 8
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7a36fdcca5bfb0..a785c1015e25bf 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -84,6 +84,17 @@ struct resource crashk_low_res = {
int kexec_should_crash(struct task_struct *p)
{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in do_exit() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
return 1;
return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index 8136ad76e5fd3e..04e91ff7560b3a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,7 +32,7 @@ static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
-static bool crash_kexec_post_notifiers;
+bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
@@ -142,7 +142,8 @@ void panic(const char *fmt, ...)
* Note: since some panic_notifiers can make crashed kernel
* more unstable, it can increase risks of the kdump failure too.
*/
- crash_kexec(NULL);
+ if (crash_kexec_post_notifiers)
+ crash_kexec(NULL);
bust_spinlocks(0);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd0272..de553849f3ac91 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -85,6 +85,18 @@ static struct lockdep_map console_lock_dep_map = {
#endif
/*
+ * Number of registered extended console drivers.
+ *
+ * If extended consoles are present, in-kernel cont reassembly is disabled
+ * and each fragment is stored as a separate log entry with proper
+ * continuation flag so that every emitted message has full metadata. This
+ * doesn't change the result for regular consoles or /proc/kmsg. For
+ * /dev/kmsg, as long as the reader concatenates messages according to
+ * consecutive continuation flags, the end result should be the same too.
+ */
+static int nr_ext_console_drivers;
+
+/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
*/
@@ -195,7 +207,7 @@ static int console_may_schedule;
* need to be changed in the future, when the requirements change.
*
* /dev/kmsg exports the structured data in the following line format:
- * "level,sequnum,timestamp;<message text>\n"
+ * "<level>,<sequnum>,<timestamp>,<contflag>;<message text>\n"
*
* The optional key/value pairs are attached as continuation lines starting
* with a space character and terminated by a newline. All possible
@@ -477,18 +489,18 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, int source)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
* already done the capabilities checks at open time.
*/
- if (from_file && type != SYSLOG_ACTION_OPEN)
- return 0;
+ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
+ goto ok;
if (syslog_action_restricted(type)) {
if (capable(CAP_SYSLOG))
- return 0;
+ goto ok;
/*
* For historical reasons, accept CAP_SYS_ADMIN too, with
* a warning.
@@ -498,13 +510,94 @@ int check_syslog_permissions(int type, bool from_file)
"CAP_SYS_ADMIN but no CAP_SYSLOG "
"(deprecated).\n",
current->comm, task_pid_nr(current));
- return 0;
+ goto ok;
}
return -EPERM;
}
+ok:
return security_syslog(type);
}
+static void append_char(char **pp, char *e, char c)
+{
+ if (*pp < e)
+ *(*pp)++ = c;
+}
+
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+ struct printk_log *msg, u64 seq,
+ enum log_flags prev_flags)
+{
+ u64 ts_usec = msg->ts_nsec;
+ char cont = '-';
+
+ do_div(ts_usec, 1000);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always neccessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags mark the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(prev_flags & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((prev_flags & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ return scnprintf(buf, size, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level, seq, ts_usec, cont);
+}
+
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *dict, size_t dict_len,
+ char *text, size_t text_len)
+{
+ char *p = buf, *e = buf + size;
+ size_t i;
+
+ /* escape non-printable characters */
+ for (i = 0; i < text_len; i++) {
+ unsigned char c = text[i];
+
+ if (c < ' ' || c >= 127 || c == '\\')
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ else
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, '\n');
+
+ if (dict_len) {
+ bool line = true;
+
+ for (i = 0; i < dict_len; i++) {
+ unsigned char c = dict[i];
+
+ if (line) {
+ append_char(&p, e, ' ');
+ line = false;
+ }
+
+ if (c == '\0') {
+ append_char(&p, e, '\n');
+ line = true;
+ continue;
+ }
+
+ if (c < ' ' || c >= 127 || c == '\\') {
+ p += scnprintf(p, e - p, "\\x%02x", c);
+ continue;
+ }
+
+ append_char(&p, e, c);
+ }
+ append_char(&p, e, '\n');
+ }
+
+ return p - buf;
+}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
@@ -512,7 +605,7 @@ struct devkmsg_user {
u32 idx;
enum log_flags prev;
struct mutex lock;
- char buf[8192];
+ char buf[CONSOLE_EXT_LOG_MAX];
};
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
@@ -570,9 +663,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
{
struct devkmsg_user *user = file->private_data;
struct printk_log *msg;
- u64 ts_usec;
- size_t i;
- char cont = '-';
size_t len;
ssize_t ret;
@@ -608,66 +698,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
}
msg = log_from_idx(user->idx);
- ts_usec = msg->ts_nsec;
- do_div(ts_usec, 1000);
+ len = msg_print_ext_header(user->buf, sizeof(user->buf),
+ msg, user->seq, user->prev);
+ len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
- /*
- * If we couldn't merge continuation line fragments during the print,
- * export the stored flags to allow an optional external merge of the
- * records. Merging the records isn't always neccessarily correct, like
- * when we hit a race during printing. In most cases though, it produces
- * better readable output. 'c' in the record flags mark the first
- * fragment of a line, '+' the following.
- */
- if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
- cont = 'c';
- else if ((msg->flags & LOG_CONT) ||
- ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
- cont = '+';
-
- len = sprintf(user->buf, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level,
- user->seq, ts_usec, cont);
user->prev = msg->flags;
-
- /* escape non-printable characters */
- for (i = 0; i < msg->text_len; i++) {
- unsigned char c = log_text(msg)[i];
-
- if (c < ' ' || c >= 127 || c == '\\')
- len += sprintf(user->buf + len, "\\x%02x", c);
- else
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
-
- if (msg->dict_len) {
- bool line = true;
-
- for (i = 0; i < msg->dict_len; i++) {
- unsigned char c = log_dict(msg)[i];
-
- if (line) {
- user->buf[len++] = ' ';
- line = false;
- }
-
- if (c == '\0') {
- user->buf[len++] = '\n';
- line = true;
- continue;
- }
-
- if (c < ' ' || c >= 127 || c == '\\') {
- len += sprintf(user->buf + len, "\\x%02x", c);
- continue;
- }
-
- user->buf[len++] = c;
- }
- user->buf[len++] = '\n';
- }
-
user->idx = log_next(user->idx);
user->seq++;
raw_spin_unlock_irq(&logbuf_lock);
@@ -1253,20 +1290,16 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
return len;
}
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+int do_syslog(int type, char __user *buf, int len, int source)
{
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
- error = check_syslog_permissions(type, from_file);
+ error = check_syslog_permissions(type, source);
if (error)
goto out;
- error = security_syslog(type);
- if (error)
- return error;
-
switch (type) {
case SYSLOG_ACTION_CLOSE: /* Close log */
break;
@@ -1346,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
syslog_prev = 0;
syslog_partial = 0;
}
- if (from_file) {
+ if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
@@ -1393,7 +1426,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(int level, const char *text, size_t len)
+static void call_console_drivers(int level,
+ const char *ext_text, size_t ext_len,
+ const char *text, size_t len)
{
struct console *con;
@@ -1414,7 +1449,10 @@ static void call_console_drivers(int level, const char *text, size_t len)
if (!cpu_online(smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
- con->write(con, text, len);
+ if (con->flags & CON_EXTENDED)
+ con->write(con, ext_text, ext_len);
+ else
+ con->write(con, text, len);
}
}
@@ -1557,8 +1595,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
if (cont.len && cont.flushed)
return false;
- if (cont.len + len > sizeof(cont.buf)) {
- /* the line gets too long, split it up in separate records */
+ /*
+ * If ext consoles are present, flush and skip in-kernel
+ * continuation. See nr_ext_console_drivers definition. Also, if
+ * the line gets too long, split it up in separate records.
+ */
+ if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
cont_flush(LOG_CONT);
return false;
}
@@ -1893,9 +1935,19 @@ static struct cont {
u8 level;
bool flushed:1;
} cont;
+static char *log_text(const struct printk_log *msg) { return NULL; }
+static char *log_dict(const struct printk_log *msg) { return NULL; }
static struct printk_log *log_from_idx(u32 idx) { return NULL; }
static u32 log_next(u32 idx) { return 0; }
-static void call_console_drivers(int level, const char *text, size_t len) {}
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+ struct printk_log *msg, u64 seq,
+ enum log_flags prev_flags) { return 0; }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *dict, size_t dict_len,
+ char *text, size_t text_len) { return 0; }
+static void call_console_drivers(int level,
+ const char *ext_text, size_t ext_len,
+ const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
@@ -2148,7 +2200,7 @@ static void console_cont_flush(char *text, size_t size)
len = cont_print_text(text, size);
raw_spin_unlock(&logbuf_lock);
stop_critical_timings();
- call_console_drivers(cont.level, text, len);
+ call_console_drivers(cont.level, NULL, 0, text, len);
start_critical_timings();
local_irq_restore(flags);
return;
@@ -2172,6 +2224,7 @@ out:
*/
void console_unlock(void)
{
+ static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[LOG_LINE_MAX + PREFIX_MAX];
static u64 seen_seq;
unsigned long flags;
@@ -2190,6 +2243,7 @@ void console_unlock(void)
again:
for (;;) {
struct printk_log *msg;
+ size_t ext_len = 0;
size_t len;
int level;
@@ -2235,13 +2289,22 @@ skip:
level = msg->level;
len += msg_print_text(msg, console_prev, false,
text + len, sizeof(text) - len);
+ if (nr_ext_console_drivers) {
+ ext_len = msg_print_ext_header(ext_text,
+ sizeof(ext_text),
+ msg, console_seq, console_prev);
+ ext_len += msg_print_ext_body(ext_text + ext_len,
+ sizeof(ext_text) - ext_len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
+ }
console_idx = log_next(console_idx);
console_seq++;
console_prev = msg->flags;
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(level, text, len);
+ call_console_drivers(level, ext_text, ext_len, text, len);
start_critical_timings();
local_irq_restore(flags);
}
@@ -2497,6 +2560,11 @@ void register_console(struct console *newcon)
newcon->next = console_drivers->next;
console_drivers->next = newcon;
}
+
+ if (newcon->flags & CON_EXTENDED)
+ if (!nr_ext_console_drivers++)
+ pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n");
+
if (newcon->flags & CON_PRINTBUFFER) {
/*
* console_unlock(); will print out the buffered messages
@@ -2569,6 +2637,9 @@ int unregister_console(struct console *console)
}
}
+ if (!res && (console->flags & CON_EXTENDED))
+ nr_ext_console_drivers--;
+
/*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d1242..272d9322bc5dfb 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, nr, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
+ __wake_up_locked_key(q, mode, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c697f73d82d6a4..7c434c39f02a25 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list)
- smpboot_unpark_thread(cur, cpu);
+ if (cpumask_test_cpu(cpu, cur->cpumask))
+ smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
}
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
+ /* Unpark any threads that were voluntarily parked. */
+ for_each_cpu_not(cpu, ht->cpumask) {
+ if (cpu_online(cpu)) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ if (tsk)
+ kthread_unpark(tsk);
+ }
+ }
+
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
+ if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
@@ -313,9 +327,53 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
+ free_cpumask_var(plug_thread->cpumask);
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+/**
+ * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
+ * @plug_thread: Hotplug thread descriptor
+ * @new: Revised mask to use
+ *
+ * The cpumask field in the smp_hotplug_thread must not be updated directly
+ * by the client, but only by calling this function.
+ * This function can only be called on a registered smp_hotplug_thread.
+ */
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *new)
+{
+ struct cpumask *old = plug_thread->cpumask;
+ cpumask_var_t tmp;
+ unsigned int cpu;
+
+ if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
+ return -ENOMEM;
+
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+
+ /* Park threads that were exclusively enabled on the old mask. */
+ cpumask_andnot(tmp, old, new);
+ for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ smpboot_park_thread(plug_thread, cpu);
+
+ /* Unpark threads that are exclusively enabled on the new mask. */
+ cpumask_andnot(tmp, new, old);
+ for_each_cpu_and(cpu, tmp, cpu_online_mask)
+ smpboot_unpark_thread(plug_thread, cpu);
+
+ cpumask_copy(old, new);
+
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+
+ free_cpumask_var(tmp);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
+
static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b798a5f2..125b323690f926 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1722,7 +1722,6 @@ exit_err:
goto exit;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
/*
* WARNING: we don't require any capability here so be very careful
* in what is allowed for modification from userspace.
@@ -1818,6 +1817,7 @@ out:
return error;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
{
struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -1902,10 +1902,41 @@ out:
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
+ unsigned long len)
+{
+ /*
+ * This doesn't move the auxiliary vector itself since it's pinned to
+ * mm_struct, but it permits filling the vector with new values. It's
+ * up to the caller to provide sane values here, otherwise userspace
+ * tools which use this vector might be unhappy.
+ */
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+
+ if (len > sizeof(user_auxv))
+ return -EINVAL;
+
+ if (copy_from_user(user_auxv, (const void __user *)addr, len))
+ return -EFAULT;
+
+ /* Make sure the last entry is always AT_NULL */
+ user_auxv[AT_VECTOR_SIZE - 2] = 0;
+ user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+ task_lock(current);
+ memcpy(mm->saved_auxv, user_auxv, len);
+ task_unlock(current);
+
+ return 0;
+}
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
struct mm_struct *mm = current->mm;
+ struct prctl_mm_map prctl_map;
struct vm_area_struct *vma;
int error;
@@ -1925,6 +1956,9 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (opt == PR_SET_MM_EXE_FILE)
return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+ if (opt == PR_SET_MM_AUXV)
+ return prctl_set_auxv(mm, addr, arg4);
+
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
@@ -1933,42 +1967,64 @@ static int prctl_set_mm(int opt, unsigned long addr,
down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
+ prctl_map.start_code = mm->start_code;
+ prctl_map.end_code = mm->end_code;
+ prctl_map.start_data = mm->start_data;
+ prctl_map.end_data = mm->end_data;
+ prctl_map.start_brk = mm->start_brk;
+ prctl_map.brk = mm->brk;
+ prctl_map.start_stack = mm->start_stack;
+ prctl_map.arg_start = mm->arg_start;
+ prctl_map.arg_end = mm->arg_end;
+ prctl_map.env_start = mm->env_start;
+ prctl_map.env_end = mm->env_end;
+ prctl_map.auxv = NULL;
+ prctl_map.auxv_size = 0;
+ prctl_map.exe_fd = -1;
+
switch (opt) {
case PR_SET_MM_START_CODE:
- mm->start_code = addr;
+ prctl_map.start_code = addr;
break;
case PR_SET_MM_END_CODE:
- mm->end_code = addr;
+ prctl_map.end_code = addr;
break;
case PR_SET_MM_START_DATA:
- mm->start_data = addr;
+ prctl_map.start_data = addr;
break;
case PR_SET_MM_END_DATA:
- mm->end_data = addr;
+ prctl_map.end_data = addr;
+ break;
+ case PR_SET_MM_START_STACK:
+ prctl_map.start_stack = addr;
break;
-
case PR_SET_MM_START_BRK:
- if (addr <= mm->end_data)
- goto out;
-
- if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
- mm->end_data, mm->start_data))
- goto out;
-
- mm->start_brk = addr;
+ prctl_map.start_brk = addr;
break;
-
case PR_SET_MM_BRK:
- if (addr <= mm->end_data)
- goto out;
-
- if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
- mm->end_data, mm->start_data))
- goto out;
-
- mm->brk = addr;
+ prctl_map.brk = addr;
break;
+ case PR_SET_MM_ARG_START:
+ prctl_map.arg_start = addr;
+ break;
+ case PR_SET_MM_ARG_END:
+ prctl_map.arg_end = addr;
+ break;
+ case PR_SET_MM_ENV_START:
+ prctl_map.env_start = addr;
+ break;
+ case PR_SET_MM_ENV_END:
+ prctl_map.env_end = addr;
+ break;
+ default:
+ goto out;
+ }
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ goto out;
+ switch (opt) {
/*
* If command line arguments and environment
* are placed somewhere else on stack, we can
@@ -1985,52 +2041,20 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EFAULT;
goto out;
}
- if (opt == PR_SET_MM_START_STACK)
- mm->start_stack = addr;
- else if (opt == PR_SET_MM_ARG_START)
- mm->arg_start = addr;
- else if (opt == PR_SET_MM_ARG_END)
- mm->arg_end = addr;
- else if (opt == PR_SET_MM_ENV_START)
- mm->env_start = addr;
- else if (opt == PR_SET_MM_ENV_END)
- mm->env_end = addr;
- break;
-
- /*
- * This doesn't move auxiliary vector itself
- * since it's pinned to mm_struct, but allow
- * to fill vector with new values. It's up
- * to a caller to provide sane values here
- * otherwise user space tools which use this
- * vector might be unhappy.
- */
- case PR_SET_MM_AUXV: {
- unsigned long user_auxv[AT_VECTOR_SIZE];
-
- if (arg4 > sizeof(user_auxv))
- goto out;
- up_read(&mm->mmap_sem);
-
- if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
- return -EFAULT;
-
- /* Make sure the last entry is always AT_NULL */
- user_auxv[AT_VECTOR_SIZE - 2] = 0;
- user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
- BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
- task_lock(current);
- memcpy(mm->saved_auxv, user_auxv, arg4);
- task_unlock(current);
-
- return 0;
- }
- default:
- goto out;
}
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
error = 0;
out:
up_read(&mm->mmap_sem);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8f3..8b3e10ea512324 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -218,6 +218,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c3eee4c6d6c173..c566b5614f968c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -881,6 +881,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+ {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 581a68a04c6408..a6ffa43f299301 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
+#include <linux/tick.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
#endif
+static struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
* do we care if a 0 races with a timestamp?
* all it means is the softlock check starts one cycle later
*/
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
per_cpu(watchdog_touch_ts, cpu) = 0;
}
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
goto unlock;
get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
watchdog_nmi_enable(cpu);
put_online_cpus();
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
goto unlock;
get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
watchdog_nmi_disable(cpu);
put_online_cpus();
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
int cpu;
get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_watchdog_cpu(cpu)
update_watchdog(cpu);
put_online_cpus();
}
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
err = smpboot_register_percpu_thread(&watchdog_threads);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else
+ else {
+ if (smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask))
+ pr_err("Failed to set cpumask for watchdog threads\n");
watchdog_running = 1;
+ }
} else {
/*
* Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
+
+/*
+ * The cpumask is the mask of possible cpus that the watchdog can run
+ * on, not the mask of cpus it is actually running on. This allows the
+ * user to specify a mask that will include cpus that have not yet
+ * been brought online, if desired.
+ */
+int proc_watchdog_cpumask(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err;
+
+ mutex_lock(&watchdog_proc_mutex);
+ err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+ if (!err && write) {
+ /* Remove impossible cpus to keep sysctl output cleaner. */
+ cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
+ cpu_possible_mask);
+
+ if (watchdog_running) {
+ /*
+ * Failure would be due to being unable to allocate
+ * a temporary cpumask, so we are likely not in a
+ * position to do much else to make things better.
+ */
+ if (smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask) != 0)
+ pr_err("cpumask update failed\n");
+ }
+ }
+ mutex_unlock(&watchdog_proc_mutex);
+ return err;
+}
+
#endif /* CONFIG_SYSCTL */
void __init lockup_detector_init(void)
{
set_sample_period();
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_enabled()) {
+ if (!cpumask_empty(tick_nohz_full_mask))
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
+ tick_nohz_full_mask);
+ } else
+ cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#else
+ cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#endif
+
if (watchdog_enabled)
watchdog_enable_all_cpus();
}
diff --git a/lib/Kconfig b/lib/Kconfig
index 34e332b8d32629..e9ac40c1763ab1 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -188,6 +188,13 @@ config CRC8
when they need to do cyclic redundancy check according CRC8
algorithm. Module will be called crc8.
+config CRC64_ECMA
+ tristate "CRC64 ECMA function"
+ help
+ This option provides CRC64 ECMA function. Drivers may select this
+ when they need to do cyclic redundancy check according to the CRC64
+ ECMA algorithm.
+
config AUDIT_GENERIC
bool
depends on AUDIT && !AUDIT_ARCH
diff --git a/lib/Makefile b/lib/Makefile
index ff37c8c2f7b24f..81ea9822b01b67 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -76,6 +76,7 @@ obj-$(CONFIG_CRC32) += crc32.o
obj-$(CONFIG_CRC7) += crc7.o
obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o
obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
obj-$(CONFIG_842_COMPRESS) += 842/
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 64c0926f5dd8e5..a578a018919977 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -462,19 +462,20 @@ EXPORT_SYMBOL(bitmap_parse_user);
* Output format is a comma-separated list of decimal numbers and
* ranges if list is specified or hex digits grouped into comma-separated
* sets of 8 digits/set. Returns the number of characters written to buf.
+ *
+ * It is assumed that @buf is a pointer into a PAGE_SIZE area and that
+ * sufficient storage remains at @buf to accommodate the
+ * bitmap_print_to_pagebuf() output.
*/
int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
int nmaskbits)
{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf - 2;
+ ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
int n = 0;
- if (len > 1) {
- n = list ? scnprintf(buf, len, "%*pbl", nmaskbits, maskp) :
- scnprintf(buf, len, "%*pb", nmaskbits, maskp);
- buf[n++] = '\n';
- buf[n] = '\0';
- }
+ if (len > 1)
+ n = list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) :
+ scnprintf(buf, len, "%*pb\n", nmaskbits, maskp);
return n;
}
EXPORT_SYMBOL(bitmap_print_to_pagebuf);
@@ -506,12 +507,12 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
unsigned a, b;
int c, old_c, totaldigits;
const char __user __force *ubuf = (const char __user __force *)buf;
- int exp_digit, in_range;
+ int at_start, in_range;
totaldigits = c = 0;
bitmap_zero(maskp, nmaskbits);
do {
- exp_digit = 1;
+ at_start = 1;
in_range = 0;
a = b = 0;
@@ -540,11 +541,10 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
break;
if (c == '-') {
- if (exp_digit || in_range)
+ if (at_start || in_range)
return -EINVAL;
b = 0;
in_range = 1;
- exp_digit = 1;
continue;
}
@@ -554,16 +554,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
b = b * 10 + (c - '0');
if (!in_range)
a = b;
- exp_digit = 0;
+ at_start = 0;
totaldigits++;
}
if (!(a <= b))
return -EINVAL;
if (b >= nmaskbits)
return -ERANGE;
- while (a <= b) {
- set_bit(a, maskp);
- a++;
+ if (!at_start) {
+ while (a <= b) {
+ set_bit(a, maskp);
+ a++;
+ }
}
} while (buflen && c == ',');
return 0;
diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c
new file mode 100644
index 00000000000000..41629ea5a60cef
--- /dev/null
+++ b/lib/crc64_ecma.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/crc64_ecma.h>
+
+
+#define CRC64_BYTE_MASK 0xFF
+#define CRC64_TABLE_SIZE 256
+
+
+struct crc64_table {
+ u64 seed;
+ u64 table[CRC64_TABLE_SIZE];
+};
+
+
+static struct crc64_table CRC64_ECMA_182 = {
+ CRC64_DEFAULT_INITVAL,
+ {
+ 0x0000000000000000ULL,
+ 0xb32e4cbe03a75f6fULL,
+ 0xf4843657a840a05bULL,
+ 0x47aa7ae9abe7ff34ULL,
+ 0x7bd0c384ff8f5e33ULL,
+ 0xc8fe8f3afc28015cULL,
+ 0x8f54f5d357cffe68ULL,
+ 0x3c7ab96d5468a107ULL,
+ 0xf7a18709ff1ebc66ULL,
+ 0x448fcbb7fcb9e309ULL,
+ 0x0325b15e575e1c3dULL,
+ 0xb00bfde054f94352ULL,
+ 0x8c71448d0091e255ULL,
+ 0x3f5f08330336bd3aULL,
+ 0x78f572daa8d1420eULL,
+ 0xcbdb3e64ab761d61ULL,
+ 0x7d9ba13851336649ULL,
+ 0xceb5ed8652943926ULL,
+ 0x891f976ff973c612ULL,
+ 0x3a31dbd1fad4997dULL,
+ 0x064b62bcaebc387aULL,
+ 0xb5652e02ad1b6715ULL,
+ 0xf2cf54eb06fc9821ULL,
+ 0x41e11855055bc74eULL,
+ 0x8a3a2631ae2dda2fULL,
+ 0x39146a8fad8a8540ULL,
+ 0x7ebe1066066d7a74ULL,
+ 0xcd905cd805ca251bULL,
+ 0xf1eae5b551a2841cULL,
+ 0x42c4a90b5205db73ULL,
+ 0x056ed3e2f9e22447ULL,
+ 0xb6409f5cfa457b28ULL,
+ 0xfb374270a266cc92ULL,
+ 0x48190ecea1c193fdULL,
+ 0x0fb374270a266cc9ULL,
+ 0xbc9d3899098133a6ULL,
+ 0x80e781f45de992a1ULL,
+ 0x33c9cd4a5e4ecdceULL,
+ 0x7463b7a3f5a932faULL,
+ 0xc74dfb1df60e6d95ULL,
+ 0x0c96c5795d7870f4ULL,
+ 0xbfb889c75edf2f9bULL,
+ 0xf812f32ef538d0afULL,
+ 0x4b3cbf90f69f8fc0ULL,
+ 0x774606fda2f72ec7ULL,
+ 0xc4684a43a15071a8ULL,
+ 0x83c230aa0ab78e9cULL,
+ 0x30ec7c140910d1f3ULL,
+ 0x86ace348f355aadbULL,
+ 0x3582aff6f0f2f5b4ULL,
+ 0x7228d51f5b150a80ULL,
+ 0xc10699a158b255efULL,
+ 0xfd7c20cc0cdaf4e8ULL,
+ 0x4e526c720f7dab87ULL,
+ 0x09f8169ba49a54b3ULL,
+ 0xbad65a25a73d0bdcULL,
+ 0x710d64410c4b16bdULL,
+ 0xc22328ff0fec49d2ULL,
+ 0x85895216a40bb6e6ULL,
+ 0x36a71ea8a7ace989ULL,
+ 0x0adda7c5f3c4488eULL,
+ 0xb9f3eb7bf06317e1ULL,
+ 0xfe5991925b84e8d5ULL,
+ 0x4d77dd2c5823b7baULL,
+ 0x64b62bcaebc387a1ULL,
+ 0xd7986774e864d8ceULL,
+ 0x90321d9d438327faULL,
+ 0x231c512340247895ULL,
+ 0x1f66e84e144cd992ULL,
+ 0xac48a4f017eb86fdULL,
+ 0xebe2de19bc0c79c9ULL,
+ 0x58cc92a7bfab26a6ULL,
+ 0x9317acc314dd3bc7ULL,
+ 0x2039e07d177a64a8ULL,
+ 0x67939a94bc9d9b9cULL,
+ 0xd4bdd62abf3ac4f3ULL,
+ 0xe8c76f47eb5265f4ULL,
+ 0x5be923f9e8f53a9bULL,
+ 0x1c4359104312c5afULL,
+ 0xaf6d15ae40b59ac0ULL,
+ 0x192d8af2baf0e1e8ULL,
+ 0xaa03c64cb957be87ULL,
+ 0xeda9bca512b041b3ULL,
+ 0x5e87f01b11171edcULL,
+ 0x62fd4976457fbfdbULL,
+ 0xd1d305c846d8e0b4ULL,
+ 0x96797f21ed3f1f80ULL,
+ 0x2557339fee9840efULL,
+ 0xee8c0dfb45ee5d8eULL,
+ 0x5da24145464902e1ULL,
+ 0x1a083bacedaefdd5ULL,
+ 0xa9267712ee09a2baULL,
+ 0x955cce7fba6103bdULL,
+ 0x267282c1b9c65cd2ULL,
+ 0x61d8f8281221a3e6ULL,
+ 0xd2f6b4961186fc89ULL,
+ 0x9f8169ba49a54b33ULL,
+ 0x2caf25044a02145cULL,
+ 0x6b055fede1e5eb68ULL,
+ 0xd82b1353e242b407ULL,
+ 0xe451aa3eb62a1500ULL,
+ 0x577fe680b58d4a6fULL,
+ 0x10d59c691e6ab55bULL,
+ 0xa3fbd0d71dcdea34ULL,
+ 0x6820eeb3b6bbf755ULL,
+ 0xdb0ea20db51ca83aULL,
+ 0x9ca4d8e41efb570eULL,
+ 0x2f8a945a1d5c0861ULL,
+ 0x13f02d374934a966ULL,
+ 0xa0de61894a93f609ULL,
+ 0xe7741b60e174093dULL,
+ 0x545a57dee2d35652ULL,
+ 0xe21ac88218962d7aULL,
+ 0x5134843c1b317215ULL,
+ 0x169efed5b0d68d21ULL,
+ 0xa5b0b26bb371d24eULL,
+ 0x99ca0b06e7197349ULL,
+ 0x2ae447b8e4be2c26ULL,
+ 0x6d4e3d514f59d312ULL,
+ 0xde6071ef4cfe8c7dULL,
+ 0x15bb4f8be788911cULL,
+ 0xa6950335e42fce73ULL,
+ 0xe13f79dc4fc83147ULL,
+ 0x521135624c6f6e28ULL,
+ 0x6e6b8c0f1807cf2fULL,
+ 0xdd45c0b11ba09040ULL,
+ 0x9aefba58b0476f74ULL,
+ 0x29c1f6e6b3e0301bULL,
+ 0xc96c5795d7870f42ULL,
+ 0x7a421b2bd420502dULL,
+ 0x3de861c27fc7af19ULL,
+ 0x8ec62d7c7c60f076ULL,
+ 0xb2bc941128085171ULL,
+ 0x0192d8af2baf0e1eULL,
+ 0x4638a2468048f12aULL,
+ 0xf516eef883efae45ULL,
+ 0x3ecdd09c2899b324ULL,
+ 0x8de39c222b3eec4bULL,
+ 0xca49e6cb80d9137fULL,
+ 0x7967aa75837e4c10ULL,
+ 0x451d1318d716ed17ULL,
+ 0xf6335fa6d4b1b278ULL,
+ 0xb199254f7f564d4cULL,
+ 0x02b769f17cf11223ULL,
+ 0xb4f7f6ad86b4690bULL,
+ 0x07d9ba1385133664ULL,
+ 0x4073c0fa2ef4c950ULL,
+ 0xf35d8c442d53963fULL,
+ 0xcf273529793b3738ULL,
+ 0x7c0979977a9c6857ULL,
+ 0x3ba3037ed17b9763ULL,
+ 0x888d4fc0d2dcc80cULL,
+ 0x435671a479aad56dULL,
+ 0xf0783d1a7a0d8a02ULL,
+ 0xb7d247f3d1ea7536ULL,
+ 0x04fc0b4dd24d2a59ULL,
+ 0x3886b22086258b5eULL,
+ 0x8ba8fe9e8582d431ULL,
+ 0xcc0284772e652b05ULL,
+ 0x7f2cc8c92dc2746aULL,
+ 0x325b15e575e1c3d0ULL,
+ 0x8175595b76469cbfULL,
+ 0xc6df23b2dda1638bULL,
+ 0x75f16f0cde063ce4ULL,
+ 0x498bd6618a6e9de3ULL,
+ 0xfaa59adf89c9c28cULL,
+ 0xbd0fe036222e3db8ULL,
+ 0x0e21ac88218962d7ULL,
+ 0xc5fa92ec8aff7fb6ULL,
+ 0x76d4de52895820d9ULL,
+ 0x317ea4bb22bfdfedULL,
+ 0x8250e80521188082ULL,
+ 0xbe2a516875702185ULL,
+ 0x0d041dd676d77eeaULL,
+ 0x4aae673fdd3081deULL,
+ 0xf9802b81de97deb1ULL,
+ 0x4fc0b4dd24d2a599ULL,
+ 0xfceef8632775faf6ULL,
+ 0xbb44828a8c9205c2ULL,
+ 0x086ace348f355aadULL,
+ 0x34107759db5dfbaaULL,
+ 0x873e3be7d8faa4c5ULL,
+ 0xc094410e731d5bf1ULL,
+ 0x73ba0db070ba049eULL,
+ 0xb86133d4dbcc19ffULL,
+ 0x0b4f7f6ad86b4690ULL,
+ 0x4ce50583738cb9a4ULL,
+ 0xffcb493d702be6cbULL,
+ 0xc3b1f050244347ccULL,
+ 0x709fbcee27e418a3ULL,
+ 0x3735c6078c03e797ULL,
+ 0x841b8ab98fa4b8f8ULL,
+ 0xadda7c5f3c4488e3ULL,
+ 0x1ef430e13fe3d78cULL,
+ 0x595e4a08940428b8ULL,
+ 0xea7006b697a377d7ULL,
+ 0xd60abfdbc3cbd6d0ULL,
+ 0x6524f365c06c89bfULL,
+ 0x228e898c6b8b768bULL,
+ 0x91a0c532682c29e4ULL,
+ 0x5a7bfb56c35a3485ULL,
+ 0xe955b7e8c0fd6beaULL,
+ 0xaeffcd016b1a94deULL,
+ 0x1dd181bf68bdcbb1ULL,
+ 0x21ab38d23cd56ab6ULL,
+ 0x9285746c3f7235d9ULL,
+ 0xd52f0e859495caedULL,
+ 0x6601423b97329582ULL,
+ 0xd041dd676d77eeaaULL,
+ 0x636f91d96ed0b1c5ULL,
+ 0x24c5eb30c5374ef1ULL,
+ 0x97eba78ec690119eULL,
+ 0xab911ee392f8b099ULL,
+ 0x18bf525d915feff6ULL,
+ 0x5f1528b43ab810c2ULL,
+ 0xec3b640a391f4fadULL,
+ 0x27e05a6e926952ccULL,
+ 0x94ce16d091ce0da3ULL,
+ 0xd3646c393a29f297ULL,
+ 0x604a2087398eadf8ULL,
+ 0x5c3099ea6de60cffULL,
+ 0xef1ed5546e415390ULL,
+ 0xa8b4afbdc5a6aca4ULL,
+ 0x1b9ae303c601f3cbULL,
+ 0x56ed3e2f9e224471ULL,
+ 0xe5c372919d851b1eULL,
+ 0xa26908783662e42aULL,
+ 0x114744c635c5bb45ULL,
+ 0x2d3dfdab61ad1a42ULL,
+ 0x9e13b115620a452dULL,
+ 0xd9b9cbfcc9edba19ULL,
+ 0x6a978742ca4ae576ULL,
+ 0xa14cb926613cf817ULL,
+ 0x1262f598629ba778ULL,
+ 0x55c88f71c97c584cULL,
+ 0xe6e6c3cfcadb0723ULL,
+ 0xda9c7aa29eb3a624ULL,
+ 0x69b2361c9d14f94bULL,
+ 0x2e184cf536f3067fULL,
+ 0x9d36004b35545910ULL,
+ 0x2b769f17cf112238ULL,
+ 0x9858d3a9ccb67d57ULL,
+ 0xdff2a94067518263ULL,
+ 0x6cdce5fe64f6dd0cULL,
+ 0x50a65c93309e7c0bULL,
+ 0xe388102d33392364ULL,
+ 0xa4226ac498dedc50ULL,
+ 0x170c267a9b79833fULL,
+ 0xdcd7181e300f9e5eULL,
+ 0x6ff954a033a8c131ULL,
+ 0x28532e49984f3e05ULL,
+ 0x9b7d62f79be8616aULL,
+ 0xa707db9acf80c06dULL,
+ 0x14299724cc279f02ULL,
+ 0x5383edcd67c06036ULL,
+ 0xe0ada17364673f59ULL
+ }
+};
+
+
+/*
+ * crc64_ecma_seed - Initializes the CRC64 ECMA seed.
+ */
+u64 crc64_ecma_seed(void)
+{
+ return CRC64_ECMA_182.seed;
+}
+EXPORT_SYMBOL(crc64_ecma_seed);
+
+/*
+ * crc64_ecma - Computes the 64 bit ECMA CRC.
+ *
+ * pdata: pointer to the data to compute checksum for.
+ * nbytes: number of bytes in data buffer.
+ * seed: CRC seed.
+ */
+u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed)
+{
+ unsigned int i;
+ u64 crc = seed;
+
+ for (i = 0; i < nbytes; i++)
+ crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^
+ (crc >> 8);
+
+ return crc;
+}
+EXPORT_SYMBOL(crc64_ecma);
+
+MODULE_DESCRIPTION("CRC64 ECMA function");
+MODULE_AUTHOR("Freescale Semiconductor Inc.");
+MODULE_LICENSE("GPL");
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 061550de77bc04..f9ebe1c82060ec 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -65,7 +65,8 @@ static struct kmem_cache *radix_tree_node_cachep;
*/
struct radix_tree_preload {
int nr;
- struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE];
+ /* nodes->private_data points to next preallocated node */
+ struct radix_tree_node *nodes;
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
@@ -197,8 +198,9 @@ radix_tree_node_alloc(struct radix_tree_root *root)
*/
rtp = this_cpu_ptr(&radix_tree_preloads);
if (rtp->nr) {
- ret = rtp->nodes[rtp->nr - 1];
- rtp->nodes[rtp->nr - 1] = NULL;
+ ret = rtp->nodes;
+ rtp->nodes = ret->private_data;
+ ret->private_data = NULL;
rtp->nr--;
}
/*
@@ -257,17 +259,20 @@ static int __radix_tree_preload(gfp_t gfp_mask)
preempt_disable();
rtp = this_cpu_ptr(&radix_tree_preloads);
- while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
+ while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
preempt_enable();
node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
if (node == NULL)
goto out;
preempt_disable();
rtp = this_cpu_ptr(&radix_tree_preloads);
- if (rtp->nr < ARRAY_SIZE(rtp->nodes))
- rtp->nodes[rtp->nr++] = node;
- else
+ if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
+ node->private_data = rtp->nodes;
+ rtp->nodes = node;
+ rtp->nr++;
+ } else {
kmem_cache_free(radix_tree_node_cachep, node);
+ }
}
ret = 0;
out:
@@ -1463,15 +1468,16 @@ static int radix_tree_callback(struct notifier_block *nfb,
{
int cpu = (long)hcpu;
struct radix_tree_preload *rtp;
+ struct radix_tree_node *node;
/* Free per-cpu pool of perloaded nodes */
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
rtp = &per_cpu(radix_tree_preloads, cpu);
while (rtp->nr) {
- kmem_cache_free(radix_tree_node_cachep,
- rtp->nodes[rtp->nr-1]);
- rtp->nodes[rtp->nr-1] = NULL;
- rtp->nr--;
+ node = rtp->nodes;
+ rtp->nodes = node->private_data;
+ kmem_cache_free(radix_tree_node_cachep, node);
+ rtp->nr--;
}
}
return NOTIFY_OK;
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 99fbc2f238c497..d105a9f56878e5 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -650,9 +650,8 @@ EXPORT_SYMBOL(sg_miter_stop);
* Returns the number of copied bytes.
*
**/
-static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
- void *buf, size_t buflen, off_t skip,
- bool to_buffer)
+size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
+ size_t buflen, off_t skip, bool to_buffer)
{
unsigned int offset = 0;
struct sg_mapping_iter miter;
@@ -689,6 +688,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
local_irq_restore(flags);
return offset;
}
+EXPORT_SYMBOL(sg_copy_buffer);
/**
* sg_copy_from_buffer - Copy from a linear buffer to an SG list
@@ -701,9 +701,9 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
*
**/
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
- void *buf, size_t buflen)
+ const void *buf, size_t buflen)
{
- return sg_copy_buffer(sgl, nents, buf, buflen, 0, false);
+ return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false);
}
EXPORT_SYMBOL(sg_copy_from_buffer);
@@ -729,16 +729,16 @@ EXPORT_SYMBOL(sg_copy_to_buffer);
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy from
- * @skip: Number of bytes to skip before copying
* @buflen: The number of bytes to copy
+ * @skip: Number of bytes to skip before copying
*
* Returns the number of copied bytes.
*
**/
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
- void *buf, size_t buflen, off_t skip)
+ const void *buf, size_t buflen, off_t skip)
{
- return sg_copy_buffer(sgl, nents, buf, buflen, skip, false);
+ return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false);
}
EXPORT_SYMBOL(sg_pcopy_from_buffer);
@@ -747,8 +747,8 @@ EXPORT_SYMBOL(sg_pcopy_from_buffer);
* @sgl: The SG list
* @nents: Number of SG entries
* @buf: Where to copy to
- * @skip: Number of bytes to skip before copying
* @buflen: The number of bytes to copy
+ * @skip: Number of bytes to skip before copying
*
* Returns the number of copied bytes.
*
diff --git a/lib/sort.c b/lib/sort.c
index 43c9fe73ae2e2f..fc20df42aa6f29 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -8,6 +8,12 @@
#include <linux/export.h>
#include <linux/sort.h>
+static int alignment_ok(const void *base, int align)
+{
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ ((unsigned long)base & (align - 1)) == 0;
+}
+
static void u32_swap(void *a, void *b, int size)
{
u32 t = *(u32 *)a;
@@ -15,6 +21,13 @@ static void u32_swap(void *a, void *b, int size)
*(u32 *)b = t;
}
+static void u64_swap(void *a, void *b, int size)
+{
+ u64 t = *(u64 *)a;
+ *(u64 *)a = *(u64 *)b;
+ *(u64 *)b = t;
+}
+
static void generic_swap(void *a, void *b, int size)
{
char t;
@@ -50,8 +63,14 @@ void sort(void *base, size_t num, size_t size,
/* pre-scale counters for performance */
int i = (num/2 - 1) * size, n = num * size, c, r;
- if (!swap_func)
- swap_func = (size == 4 ? u32_swap : generic_swap);
+ if (!swap_func) {
+ if (size == 4 && alignment_ok(base, 4))
+ swap_func = u32_swap;
+ else if (size == 8 && alignment_ok(base, 8))
+ swap_func = u64_swap;
+ else
+ swap_func = generic_swap;
+ }
/* heapify */
for ( ; i >= 0; i -= size) {
diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c
index c227cc43ec0a39..5241df36eedf92 100644
--- a/lib/test-hexdump.c
+++ b/lib/test-hexdump.c
@@ -25,19 +25,19 @@ static const char * const test_data_1_le[] __initconst = {
"4c", "d1", "19", "99", "43", "b1", "af", "0c",
};
-static const char *test_data_2_le[] __initdata = {
+static const char * const test_data_2_le[] __initconst = {
"32be", "7bdb", "180a", "b293",
"ba70", "24c4", "837d", "9b34",
"9ca6", "ad31", "0f9c", "e9ac",
"d14c", "9919", "b143", "0caf",
};
-static const char *test_data_4_le[] __initdata = {
+static const char * const test_data_4_le[] __initconst = {
"7bdb32be", "b293180a", "24c4ba70", "9b34837d",
"ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143",
};
-static const char *test_data_8_le[] __initdata = {
+static const char * const test_data_8_le[] __initconst = {
"b293180a7bdb32be", "9b34837d24c4ba70",
"e9ac0f9cad319ca6", "0cafb1439919d14c",
};
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index da39c608a28cb9..8243e2fb1e6bf2 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1360,6 +1360,21 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
}
}
+static noinline_for_stack
+char *comm_name(char *buf, char *end, struct task_struct *tsk,
+ struct printf_spec spec, const char *fmt)
+{
+ char name[TASK_COMM_LEN];
+
+ /* Caller can pass NULL instead of current. */
+ if (!tsk)
+ tsk = current;
+ /* Not using get_task_comm() in case I'm in IRQ context. */
+ memcpy(name, tsk->comm, TASK_COMM_LEN);
+ name[sizeof(name) - 1] = '\0';
+ return string(buf, end, name, spec);
+}
+
int kptr_restrict __read_mostly;
/*
@@ -1447,6 +1462,7 @@ int kptr_restrict __read_mostly;
* - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
* (legacy clock framework) of the clock
* - 'Cr' For a clock, it prints the current rate of the clock
+ * - 'T' task_struct->comm
*
* Note: The difference between 'S' and 'F' is that on ia64 and ppc64
* function pointers are really function descriptors, which contain a
@@ -1458,7 +1474,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
{
int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0);
- if (!ptr && *fmt != 'K') {
+ if (!ptr && *fmt != 'K' && *fmt != 'T') {
/*
* Print (null) with the same width as a pointer so it makes
* tabular output look nice.
@@ -1597,6 +1613,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
return dentry_name(buf, end,
((const struct file *)ptr)->f_path.dentry,
spec, fmt);
+ case 'T':
+ return comm_name(buf, end, ptr, spec, fmt);
}
spec.flags |= SMALL;
if (spec.field_width == -1) {
diff --git a/mm/Kconfig b/mm/Kconfig
index 390214da45463e..e79de2bd12cd04 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
select MEMORY_ISOLATION
+ select RAS
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
@@ -635,3 +636,21 @@ config MAX_STACK_SIZE_MB
changed to a smaller value in which case that is used.
A sane initial value is 80 MB.
+
+# For architectures that support deferred memory initialisation
+config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+ bool
+
+config DEFERRED_STRUCT_PAGE_INIT
+ bool "Defer initialisation of struct pages to kswapd"
+ default n
+ depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+ depends on MEMORY_HOTPLUG
+ help
+ Ordinarily all struct pages are initialised during early boot in a
+ single thread. On very large machines this can take a considerable
+ amount of time. If this option is set, large machines will bring up
+ a subset of memmap at boot and then initialise the rest in parallel
+ when kswapd starts. This has a potential performance impact on
+ processes running early in the lifetime of the systemm until kswapd
+ finishes the initialisation.
diff --git a/mm/Makefile b/mm/Makefile
index 98c4eaeabdcb45..b424d5e5b6ff5b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 477be696511d66..a23dd193465482 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
end = PFN_DOWN(physaddr + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
+ __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
totalram_pages++;
}
}
@@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
- unsigned long *map, start, end, pages, count = 0;
+ unsigned long *map, start, end, pages, cur, count = 0;
if (!bdata->node_bootmem_map)
return 0;
@@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
int order = ilog2(BITS_PER_LONG);
- __free_pages_bootmem(pfn_to_page(start), order);
+ __free_pages_bootmem(pfn_to_page(start), start, order);
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
- unsigned long cur = start;
+ cur = start;
start = ALIGN(start + 1, BITS_PER_LONG);
while (vec && cur != start) {
if (vec & 1) {
page = pfn_to_page(cur);
- __free_pages_bootmem(page, 0);
+ __free_pages_bootmem(page, cur, 0);
count++;
}
vec >>= 1;
@@ -229,12 +229,13 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
}
}
+ cur = bdata->node_min_pfn;
page = virt_to_page(bdata->node_bootmem_map);
pages = bdata->node_low_pfn - bdata->node_min_pfn;
pages = bootmem_bootmap_pages(pages);
count += pages;
while (pages--)
- __free_pages_bootmem(page++, 0);
+ __free_pages_bootmem(page++, cur++, 0);
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
diff --git a/mm/cma.c b/mm/cma.c
index 3a7a67b933942f..e7d1db5330254d 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* ensure minimal alignment requied by mm core */
+ /* ensure minimal alignment required by mm core */
alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
/* alignment should be aligned with order_per_bit */
@@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
/*
* high_memory isn't direct mapped memory so retrieving its physical
* address isn't appropriate. But it would be useful to check the
- * physical address of the highmem boundary so it's justfiable to get
+ * physical address of the highmem boundary so it's justifiable to get
* the physical address from it. On x86 there is a validation check for
* this case, so the following workaround is needed to avoid it.
*/
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range(size, alignment,
- highmem_start, limit);
+ highmem_start, limit,
+ MEMBLOCK_NONE);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range(size, alignment, base,
- limit);
+ limit,
+ MEMBLOCK_NONE);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08da99a2ec..16e1b57934527f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -732,18 +732,21 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* splitting and collapsing (collapsing has already happened
* if PageLRU is set) but the lock is not necessarily taken
* here and it is wasteful to take it just to check transhuge.
- * Check TransHuge without lock and skip the whole pageblock if
- * it's either a transhuge or hugetlbfs page, as calling
- * compound_order() without preventing THP from splitting the
- * page underneath us may return surprising results.
+ * Check PageCompound without lock and skip the whole pageblock
+ * if it's a transhuge page, as calling compound_order()
+ * without preventing THP from splitting the page underneath us
+ * may return surprising results.
+ * If we happen to check a THP tail page, compound_order()
+ * returns 0. It should be rare enough to not bother with
+ * using compound_head() in that case.
*/
- if (PageTransHuge(page)) {
- if (!locked)
- low_pfn = ALIGN(low_pfn + 1,
- pageblock_nr_pages) - 1;
+ if (PageCompound(page)) {
+ int nr;
+ if (locked)
+ nr = 1 << compound_order(page);
else
- low_pfn += (1 << compound_order(page)) - 1;
-
+ nr = pageblock_nr_pages;
+ low_pfn += nr - 1;
continue;
}
@@ -763,11 +766,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!locked)
break;
- /* Recheck PageLRU and PageTransHuge under lock */
+ /* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
+ if (PageCompound(page)) {
+ int nr = 1 << compound_order(page);
+ low_pfn += nr - 1;
continue;
}
}
@@ -778,7 +782,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page), page);
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
diff --git a/mm/filemap.c b/mm/filemap.c
index bfc1ab053b1224..b63fb81df336e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -199,7 +199,9 @@ void __delete_from_page_cache(struct page *page, void *shadow,
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
- __dec_zone_page_state(page, NR_FILE_PAGES);
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(page))
+ __dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
@@ -498,7 +500,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
- __inc_zone_page_state(new, NR_FILE_PAGES);
+
+ /*
+ * hugetlb pages do not participate in page cache accounting.
+ */
+ if (!PageHuge(new))
+ __inc_zone_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -591,7 +598,10 @@ static int __add_to_page_cache_locked(struct page *page,
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- __inc_zone_page_state(page, NR_FILE_PAGES);
+
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!huge)
+ __inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false);
@@ -631,11 +641,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -758,6 +768,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -822,18 +833,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1670,8 +1683,8 @@ no_cached_page:
error = -ENOMEM;
goto out;
}
- error = add_to_page_cache_lru(page, mapping,
- index, GFP_KERNEL);
+ error = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (error) {
page_cache_release(page);
if (error == -EEXIST) {
@@ -1772,7 +1785,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, mapping, offset,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809eb0859a..27a9924caf6174 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
#include <linux/swapfile.h>
/*
- * frontswap_ops is set by frontswap_register_ops to contain the pointers
- * to the frontswap "backend" implementation functions.
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap "backend" implementation functions. Multiple implementations
+ * may be registered, but implementations can never deregister. This
+ * is a simple singly-linked list of all registered implementations.
*/
static struct frontswap_ops *frontswap_ops __read_mostly;
+#define for_each_frontswap_ops(ops) \
+ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
+
/*
* If enabled, frontswap_store will return failure even on success. As
* a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
* on all frontswap functions to not call the backend until the backend
* has registered.
*
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to frontswap_init are executed (by iterating over the need_init
- * bitmap) to create tmem_pools and set the respective poolids. All of that is
- * guarded by us using atomic bit operations on the 'need_init' bitmap.
- *
* This would not guards us against the user deciding to call swapoff right as
* we are calling the backend to initialize (so swapon is in action).
* Fortunatly for us, the swapon_mutex has been taked by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
*
* Obviously the opposite (unloading the backend) must be done after all
* the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignorning or failing the requests - at which point frontswap_ops
- * would have to be made in some fashion atomic.
+ * ignoring or failing the requests. However, there is currently no way
+ * to unload a backend once it is registered.
*/
-static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
/*
- * Register operations for frontswap, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for frontswap
*/
-struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+void frontswap_register_ops(struct frontswap_ops *ops)
{
- struct frontswap_ops *old = frontswap_ops;
- int i;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (test_and_clear_bit(i, need_init)) {
- struct swap_info_struct *sis = swap_info[i];
- /* __frontswap_init _should_ have set it! */
- if (!sis->frontswap_map)
- return ERR_PTR(-EINVAL);
- ops->init(i);
- }
+ DECLARE_BITMAP(a, MAX_SWAPFILES);
+ DECLARE_BITMAP(b, MAX_SWAPFILES);
+ struct swap_info_struct *si;
+ unsigned int i;
+
+ bitmap_zero(a, MAX_SWAPFILES);
+ bitmap_zero(b, MAX_SWAPFILES);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (!WARN_ON(!si->frontswap_map))
+ set_bit(si->type, a);
}
+ spin_unlock(&swap_lock);
+
+ /* the new ops needs to know the currently active swap devices */
+ for_each_set_bit(i, a, MAX_SWAPFILES)
+ ops->init(i);
+
/*
- * We MUST have frontswap_ops set _after_ the frontswap_init's
- * have been called. Otherwise __frontswap_store might fail. Hence
- * the barrier to make sure compiler does not re-order us.
+ * Setting frontswap_ops must happen after the ops->init() calls
+ * above; cmpxchg implies smp_mb() which will ensure the init is
+ * complete at this point.
*/
- barrier();
- frontswap_ops = ops;
- return old;
+ do {
+ ops->next = frontswap_ops;
+ } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (si->frontswap_map)
+ set_bit(si->type, b);
+ }
+ spin_unlock(&swap_lock);
+
+ /*
+ * On the very unlikely chance that a swap device was added or
+ * removed between setting the "a" list bits and the ops init
+ * calls, we re-check and do init or invalidate for any changed
+ * bits.
+ */
+ if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ if (!test_bit(i, a) && test_bit(i, b))
+ ops->init(i);
+ else if (test_bit(i, a) && !test_bit(i, b))
+ ops->invalidate_area(i);
+ }
+ }
}
EXPORT_SYMBOL(frontswap_register_ops);
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
void __frontswap_init(unsigned type, unsigned long *map)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
BUG_ON(sis == NULL);
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
- if (frontswap_ops)
- frontswap_ops->init(type);
- else {
- BUG_ON(type >= MAX_SWAPFILES);
- set_bit(type, need_init);
- }
+
+ for_each_frontswap_ops(ops)
+ ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);
bool __frontswap_test(struct swap_info_struct *sis,
pgoff_t offset)
{
- bool ret = false;
-
- if (frontswap_ops && sis->frontswap_map)
- ret = test_bit(offset, sis->frontswap_map);
- return ret;
+ if (sis->frontswap_map)
+ return test_bit(offset, sis->frontswap_map);
+ return false;
}
EXPORT_SYMBOL(__frontswap_test);
+static inline void __frontswap_set(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ set_bit(offset, sis->frontswap_map);
+ atomic_inc(&sis->frontswap_pages);
+}
+
static inline void __frontswap_clear(struct swap_info_struct *sis,
- pgoff_t offset)
+ pgoff_t offset)
{
clear_bit(offset, sis->frontswap_map);
atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
*/
int __frontswap_store(struct page *page)
{
- int ret = -1, dup = 0;
+ int ret = -1;
swp_entry_t entry = { .val = page_private(page), };
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
/*
* Return if no backend registed.
* Don't need to inc frontswap_failed_stores here.
*/
if (!frontswap_ops)
- return ret;
+ return -1;
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- if (__frontswap_test(sis, offset))
- dup = 1;
- ret = frontswap_ops->store(type, offset, page);
+
+ /*
+ * If a dup, we must remove the old page first; we can't leave the
+ * old page no matter if the store of the new page succeeds or fails,
+ * and we can't rely on the new page replacing the old page as we may
+ * not store to the same implementation that contains the old page.
+ */
+ if (__frontswap_test(sis, offset)) {
+ __frontswap_clear(sis, offset);
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ }
+
+ /* Try to store in each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->store(type, offset, page);
+ if (!ret) /* successful store */
+ break;
+ }
if (ret == 0) {
- set_bit(offset, sis->frontswap_map);
+ __frontswap_set(sis, offset);
inc_frontswap_succ_stores();
- if (!dup)
- atomic_inc(&sis->frontswap_pages);
} else {
- /*
- failed dup always results in automatic invalidate of
- the (older) page from frontswap
- */
inc_frontswap_failed_stores();
- if (dup) {
- __frontswap_clear(sis, offset);
- frontswap_ops->invalidate_page(type, offset);
- }
}
if (frontswap_writethrough_enabled)
/* report failure so swap also writes to swap device */
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
+
+ if (!frontswap_ops)
+ return -1;
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset))
- ret = frontswap_ops->load(type, offset, page);
+ if (!__frontswap_test(sis, offset))
+ return -1;
+
+ /* Try loading from each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->load(type, offset, page);
+ if (!ret) /* successful load */
+ break;
+ }
if (ret == 0) {
inc_frontswap_loads();
if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
+
+ if (!frontswap_ops)
+ return;
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset)) {
- frontswap_ops->invalidate_page(type, offset);
- __frontswap_clear(sis, offset);
- inc_frontswap_invalidates();
- }
+ if (!__frontswap_test(sis, offset))
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
- if (frontswap_ops) {
- BUG_ON(sis == NULL);
- if (sis->frontswap_map == NULL)
- return;
- frontswap_ops->invalidate_area(type);
- atomic_set(&sis->frontswap_pages, 0);
- bitmap_zero(sis->frontswap_map, sis->max);
- }
- clear_bit(type, need_init);
+ if (!frontswap_ops)
+ return;
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf363657..9671f51e954da4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -23,6 +23,7 @@
#include <linux/pagemap.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
+#include <linux/userfaultfd_k.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -717,7 +718,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
- struct page *page, gfp_t gfp)
+ struct page *page, gfp_t gfp,
+ unsigned int flags)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
@@ -725,12 +727,16 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
- return VM_FAULT_OOM;
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ put_page(page);
+ count_vm_event(THP_FAULT_FALLBACK);
+ return VM_FAULT_FALLBACK;
+ }
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
return VM_FAULT_OOM;
}
@@ -750,6 +756,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
+
+ /* Deliver the page fault to userland */
+ if (userfaultfd_missing(vma)) {
+ int ret;
+
+ spin_unlock(ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ put_page(page);
+ pte_free(mm, pgtable);
+ ret = handle_userfault(vma, haddr, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ return ret;
+ }
+
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr);
@@ -760,6 +781,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&mm->nr_ptes);
spin_unlock(ptl);
+ count_vm_event(THP_FAULT_ALLOC);
}
return 0;
@@ -771,19 +793,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
- if (!pmd_none(*pmd))
- return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
- return true;
}
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pgtable_t pgtable;
struct page *zero_page;
bool set;
+ int ret;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_FALLBACK;
}
ptl = pmd_lock(mm, pmd);
- set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
- zero_page);
- spin_unlock(ptl);
+ ret = 0;
+ set = false;
+ if (pmd_none(*pmd)) {
+ if (userfaultfd_missing(vma)) {
+ spin_unlock(ptl);
+ ret = handle_userfault(vma, haddr, flags,
+ VM_UFFD_MISSING);
+ VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+ } else {
+ set_huge_zero_page(pgtable, mm, vma,
+ haddr, pmd,
+ zero_page);
+ spin_unlock(ptl);
+ set = true;
+ }
+ } else
+ spin_unlock(ptl);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
}
- return 0;
+ return ret;
}
gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -831,14 +865,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
- put_page(page);
- count_vm_event(THP_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- return 0;
+ return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp, flags);
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -873,16 +900,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
if (is_huge_zero_pmd(pmd)) {
struct page *zero_page;
- bool set;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
zero_page = get_huge_zero_page();
- set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
zero_page);
- BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
ret = 0;
goto out_unlock;
}
@@ -1031,7 +1056,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1199,7 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1384,6 +1409,36 @@ out:
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr)
+
+{
+ spinlock_t *ptl;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 1;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ struct page *page;
+ pmd_t orig_pmd;
+
+ orig_pmd = pmdp_huge_get_and_clear(mm, addr, pmd);
+
+ /* No hugepage in swapcache */
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ spin_unlock(ptl);
+ ret = 0;
+ }
+
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1396,12 +1451,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
/*
* For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_get_and_clear. So do the
+ * when calling pmdp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1514,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
- pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1560,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+ entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mkwrite(entry);
@@ -1599,6 +1654,11 @@ unlock:
return NULL;
}
+int pmd_freeable(pmd_t pmd)
+{
+ return !pmd_dirty(pmd);
+}
+
static int __split_huge_page_splitting(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
@@ -1710,7 +1770,7 @@ static void __split_huge_page_refcount(struct page *page,
*/
page_tail->_mapcount = page->_mapcount;
- BUG_ON(page_tail->mapping);
+ BUG_ON(page_tail->mapping != TAIL_MAPPING);
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
@@ -2138,7 +2198,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2499,7 +2560,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2591,7 +2652,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- if (++none_or_zero <= khugepaged_max_ptes_none)
+ if (!userfaultfd_armed(vma) &&
+ ++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
@@ -2799,7 +2861,7 @@ static void khugepaged_do_scan(void)
cond_resched();
- if (unlikely(kthread_should_stop() || freezing(current)))
+ if (unlikely(kthread_should_stop() || try_to_freeze()))
break;
spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2882,6 @@ static void khugepaged_do_scan(void)
static void khugepaged_wait_work(void)
{
- try_to_freeze();
-
if (khugepaged_has_work()) {
if (!khugepaged_scan_sleep_millisecs)
return;
@@ -2865,7 +2925,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 271e4432734c37..a8c3087089d8a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+/*
+ * Minimum page order among possible hugepage sizes, set to a proper value
+ * at boot time.
+ */
+static unsigned int minimum_order __read_mostly = UINT_MAX;
__initdata LIST_HEAD(huge_boot_pages);
@@ -212,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
*
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock. The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map. The from and to elements are huge page
+ * indicies into the associated mapping. from indicates the starting index
+ * of the region. to represents the first index past the end of the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping. It is important to note that the to element
+ * represents the first element past the end of the region. This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
*/
struct file_region {
struct list_head link;
@@ -221,10 +238,22 @@ struct file_region {
long to;
};
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Existing regions will be expanded to accommodate the
+ * specified range. We know only existing regions need to be
+ * expanded, because region_add is only called after region_chg
+ * with the same range. If a new file_region structure must
+ * be allocated, it is done in region_chg.
+ *
+ * Return the number of new huge pages added to the map. This
+ * number is greater than or equal to zero.
+ */
static long region_add(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *nrg, *trg;
+ long add = 0;
spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
@@ -250,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t)
if (rg->to > t)
t = rg->to;
if (rg != nrg) {
+ /* Decrement return value by the deleted range.
+ * Another range will span this area so that by
+ * end of routine add will be >= zero
+ */
+ add -= (rg->to - rg->from);
list_del(&rg->link);
kfree(rg);
}
}
+
+ add += (nrg->from - f); /* Added to beginning of region */
nrg->from = f;
+ add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+
spin_unlock(&resv->lock);
- return 0;
+ VM_BUG_ON(add < 0);
+ return add;
}
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented. This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t). region_chg does
+ * not change the number of huge pages represented by the
+ * map. However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder. This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Returns the number of huge pages that need to be added
+ * to the existing reservation map for the range [f, t).
+ * This number is greater or equal to zero. -ENOMEM is
+ * returned if a new file_region structure is needed and can
+ * not be allocated.
+ */
static long region_chg(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
@@ -326,6 +384,11 @@ out_nrg:
return chg;
}
+/*
+ * Truncate the reserve map at index 'end'. Modify/truncate any
+ * region which contains end. Delete any regions past end.
+ * Return the number of huge pages removed from the map.
+ */
static long region_truncate(struct resv_map *resv, long end)
{
struct list_head *head = &resv->regions;
@@ -361,6 +424,10 @@ out:
return chg;
}
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
static long region_count(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
@@ -908,7 +975,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
- arch_release_hugepage(page);
__free_pages(page, huge_page_order(h));
}
}
@@ -1093,10 +1159,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
if (page) {
- if (arch_prepare_hugepage(page)) {
- __free_pages(page, huge_page_order(h));
- return NULL;
- }
prep_new_huge_page(h, page, nid);
}
@@ -1188,19 +1250,13 @@ static void dissolve_free_huge_page(struct page *page)
*/
void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned int order = 8 * sizeof(void *);
unsigned long pfn;
- struct hstate *h;
if (!hugepages_supported())
return;
- /* Set scan step to minimum hugepage size */
- for_each_hstate(h)
- if (order > huge_page_order(h))
- order = huge_page_order(h);
- VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+ VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
dissolve_free_huge_page(pfn_to_page(pfn));
}
@@ -1254,11 +1310,6 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
- if (page && arch_prepare_hugepage(page)) {
- __free_pages(page, huge_page_order(h));
- page = NULL;
- }
-
spin_lock(&hugetlb_lock);
if (page) {
INIT_LIST_HEAD(&page->lru);
@@ -1423,46 +1474,56 @@ static void return_unused_surplus_pages(struct hstate *h,
}
/*
- * Determine if the huge page at addr within the vma has an associated
- * reservation. Where it does not we will need to logically increase
- * reservation and actually increase subpool usage before an allocation
- * can occur. Where any new reservation would be required the
- * reservation change is prepared, but not committed. Once the page
- * has been allocated from the subpool and instantiated the change should
- * be committed via vma_commit_reservation. No action is required on
- * failure.
+ * vma_needs_reservation and vma_commit_reservation are used by the huge
+ * page allocation routines to manage reservations.
+ *
+ * vma_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation. If a reservation is
+ * needed, the value 1 is returned. The caller is then responsible for
+ * managing the global reservation and subpool usage counts. After
+ * the huge page has been allocated, vma_commit_reservation is called
+ * to add the page to the reservation map.
+ *
+ * In the normal case, vma_commit_reservation returns the same value
+ * as the preceding vma_needs_reservation call. The only time this
+ * is not the case is if a reserve map was changed between calls. It
+ * is the responsibility of the caller to notice the difference and
+ * take appropriate action.
*/
-static long vma_needs_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
+static long __vma_reservation_common(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool commit)
{
struct resv_map *resv;
pgoff_t idx;
- long chg;
+ long ret;
resv = vma_resv_map(vma);
if (!resv)
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- chg = region_chg(resv, idx, idx + 1);
+ if (commit)
+ ret = region_add(resv, idx, idx + 1);
+ else
+ ret = region_chg(resv, idx, idx + 1);
if (vma->vm_flags & VM_MAYSHARE)
- return chg;
+ return ret;
else
- return chg < 0 ? chg : 0;
+ return ret < 0 ? ret : 0;
}
-static void vma_commit_reservation(struct hstate *h,
+
+static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct resv_map *resv;
- pgoff_t idx;
-
- resv = vma_resv_map(vma);
- if (!resv)
- return;
+ return __vma_reservation_common(h, vma, addr, false);
+}
- idx = vma_hugecache_offset(h, vma, addr);
- region_add(resv, idx, idx + 1);
+static long vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, true);
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1471,7 +1532,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg;
+ long chg, commit;
int ret, idx;
struct hugetlb_cgroup *h_cg;
@@ -1512,7 +1573,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_private(page, (unsigned long)spool);
- vma_commit_reservation(h, vma, addr);
+ commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(chg > commit)) {
+ /*
+ * The page was added to the reservation map between
+ * vma_needs_reservation and vma_commit_reservation.
+ * This indicates a race with hugetlb_reserve_pages.
+ * Adjust for the subpool count incremented above AND
+ * in hugetlb_reserve_pages for the same page. Also,
+ * the reservation count added in hugetlb_reserve_pages
+ * no longer applies.
+ */
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ }
return page;
out_uncharge_cgroup:
@@ -1627,10 +1703,14 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h;
for_each_hstate(h) {
+ if (minimum_order > huge_page_order(h))
+ minimum_order = huge_page_order(h);
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
+ VM_BUG_ON(minimum_order == UINT_MAX);
}
static char * __init memfmt(char *buf, unsigned long n)
@@ -3626,8 +3706,24 @@ int hugetlb_reserve_pages(struct inode *inode,
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_add(resv_map, from, to);
+ if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ long add = region_add(resv_map, from, to);
+
+ if (unlikely(chg > add)) {
+ /*
+ * pages in this range were added to the reserve
+ * map between region_chg and region_add. This
+ * indicates a race with alloc_huge_page. Adjust
+ * the subpool and reserve counts modified above
+ * based on the difference.
+ */
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_put_pages(spool,
+ chg - add);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ }
+ }
return 0;
out_err:
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
@@ -3789,6 +3885,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
return NULL;
}
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ return 0;
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4ca5fe0042e17c..bf73ac17dad424 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val)
/*
* This implies unable to support free buddy pages.
*/
- if (!get_page_unless_zero(hpage))
+ if (!get_hwpoison_page(p))
return 0;
if (!hwpoison_filter_enable)
@@ -58,7 +58,7 @@ inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
put_out:
- put_page(hpage);
+ put_page(p);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index a25e359a40396c..36b23f1e2ca626 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
}
extern int __isolate_free_page(struct page *page, unsigned int order);
-extern void __free_pages_bootmem(struct page *page, unsigned int order);
+extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
+ unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
@@ -361,10 +362,7 @@ do { \
} while (0)
extern void mminit_verify_pageflags_layout(void);
-extern void mminit_verify_page_links(struct page *page,
- enum zone_type zone, unsigned long nid, unsigned long pfn);
extern void mminit_verify_zonelist(void);
-
#else
static inline void mminit_dprintk(enum mminit_level level,
@@ -376,11 +374,6 @@ static inline void mminit_verify_pageflags_layout(void)
{
}
-static inline void mminit_verify_page_links(struct page *page,
- enum zone_type zone, unsigned long nid, unsigned long pfn)
-{
-}
-
static inline void mminit_verify_zonelist(void)
{
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4986b0acab212e..c242adf6bc8578 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -7,7 +7,6 @@
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
#define KASAN_FREE_PAGE 0xFF /* page was freed */
-#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f0fe4f2c1fa7aa..11d6f80158967f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -195,6 +195,8 @@ static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
static int kmemleak_enabled;
+/* same as above but only for the kmemleak_free() callback */
+static int kmemleak_free_enabled;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
/* enables or disables early logging of the memory operations */
@@ -942,7 +944,7 @@ void __ref kmemleak_free(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE, ptr, 0, 0);
@@ -982,7 +984,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
@@ -1750,6 +1752,12 @@ static void kmemleak_do_cleanup(struct work_struct *work)
mutex_lock(&scan_mutex);
stop_scan_thread();
+ /*
+ * Once the scan thread has stopped, it is safe to no longer track
+ * object freeing.
+ */
+ kmemleak_free_enabled = 0;
+
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
@@ -1776,6 +1784,8 @@ static void kmemleak_disable(void)
/* check whether it is too early for a kernel thread */
if (kmemleak_initialized)
schedule_work(&cleanup_work);
+ else
+ kmemleak_free_enabled = 0;
pr_info("Kernel memory leak detector disabled\n");
}
@@ -1840,8 +1850,10 @@ void __init kmemleak_init(void)
if (kmemleak_error) {
local_irq_restore(flags);
return;
- } else
+ } else {
kmemleak_enabled = 1;
+ kmemleak_free_enabled = 1;
+ }
local_irq_restore(flags);
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 7ee101eaacdfe9..bc7be0ee208050 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1884,7 +1884,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
diff --git a/mm/madvise.c b/mm/madvise.c
index 64bb8a22110c23..70ce0d425d7231 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,14 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
+
+struct madvise_free_private {
+ struct vm_area_struct *vma;
+ struct mmu_gather *tlb;
+};
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +40,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -103,7 +112,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*prev) {
vma = *prev;
goto success;
@@ -255,6 +265,164 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct madvise_free_private *fp = walk->private;
+ struct mmu_gather *tlb = fp->tlb;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = fp->vma;
+ spinlock_t *ptl;
+ pte_t *pte, ptent;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long next;
+ int nr_swap = 0;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+ goto next;
+ /* fall through */
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte has swp_entry, just clear page table to
+ * prevent swap-in which is more expensive rather than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ if (PageSwapCache(page)) {
+ if (!trylock_page(page))
+ continue;
+
+ if (!try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ /*
+ * Some of architecture(ex, PPC) don't update TLB
+ * with set_pte_at and tlb_remove_tlb_entry so for
+ * the portability, remap the pte with old|clean
+ * after pte clearing.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(pte - 1, ptl);
+next:
+ cond_resched();
+ return 0;
+}
+
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_free_private fp = {
+ .vma = vma,
+ .tlb = tlb,
+ };
+
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = &fp,
+ };
+
+ BUG_ON(addr >= end);
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (vma->vm_file)
+ return -EINVAL;
+
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -378,6 +546,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+ * XXX: In this implementation, MADV_FREE works like
+ * MADV_DONTNEED on swapless system or full swap.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* passthrough */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -397,6 +573,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b567ed7959..74b142c92a06cd 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
#ifdef CONFIG_MOVABLE_NODE
bool movable_node_enabled __initdata_memblock = false;
#endif
+static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
+ulong __init_memblock choose_memblock_flags(void)
+{
+ return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
+}
+
/* inline so we don't get a warning when pr_debug is compiled out */
static __init_memblock const char *
memblock_type_name(struct memblock_type *type)
@@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+ NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, ulong flags)
{
phys_addr_t kernel_end, ret;
@@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid);
+ size, align, nid, flags);
if (ret)
return ret;
@@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
"memory hotunplug may be affected\n");
}
- return __memblock_find_range_top_down(start, end, size, align, nid);
+ return __memblock_find_range_top_down(start, end, size, align, nid,
+ flags);
}
/**
@@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(size, align, start, end,
- NUMA_NO_NODE);
+ phys_addr_t ret;
+ ulong flags = choose_memblock_flags();
+
+again:
+ ret = memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE, flags);
+
+ if (!ret && (flags & MEMBLOCK_MIRROR)) {
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ flags &= ~MEMBLOCK_MIRROR;
+ goto again;
+ }
+
+ return ret;
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -779,9 +805,57 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
}
/**
+ * __next_reserved_mem_region - next function for for_each_reserved_region()
+ * @idx: pointer to u64 loop variable
+ * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
+ *
+ * Iterate over all reserved memory regions.
+ */
+void __init_memblock __next_reserved_mem_region(u64 *idx,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end)
+{
+ struct memblock_type *rsv = &memblock.reserved;
+
+ if (*idx >= 0 && *idx < rsv->cnt) {
+ struct memblock_region *r = &rsv->regions[*idx];
+ phys_addr_t base = r->base;
+ phys_addr_t size = r->size;
+
+ if (out_start)
+ *out_start = base;
+ if (out_end)
+ *out_end = base + size - 1;
+
+ *idx += 1;
+ return;
+ }
+
+ /* signal end of iteration */
+ *idx = ULLONG_MAX;
+}
+
+/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+ system_has_some_mirror = true;
+
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+
+/**
* __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +877,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -831,6 +905,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -895,6 +973,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +982,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* Reverse of __next_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -935,6 +1014,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -1050,14 +1133,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, ulong flags)
{
phys_addr_t found;
if (!align)
align = SMP_CACHE_BYTES;
- found = memblock_find_in_range_node(size, align, start, end, nid);
+ found = memblock_find_in_range_node(size, align, start, end, nid,
+ flags);
if (found && !memblock_reserve(found, size)) {
/*
* The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1154,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
}
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end)
+ phys_addr_t start, phys_addr_t end,
+ ulong flags)
{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+ flags);
}
static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
- int nid)
+ int nid, ulong flags)
{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ ulong flags = choose_memblock_flags();
+ phys_addr_t ret;
+
+again:
+ ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, flags);
+
+ if (!ret && (flags & MEMBLOCK_MIRROR)) {
+ flags &= ~MEMBLOCK_MIRROR;
+ goto again;
+ }
+ return ret;
}
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
+ MEMBLOCK_NONE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1153,6 +1251,7 @@ static void * __init memblock_virt_alloc_internal(
{
phys_addr_t alloc;
void *ptr;
+ ulong flags = choose_memblock_flags();
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
@@ -1173,13 +1272,14 @@ static void * __init memblock_virt_alloc_internal(
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid);
+ nid, flags);
if (alloc)
goto done;
if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE);
+ max_addr, NUMA_NO_NODE,
+ flags);
if (alloc)
goto done;
}
@@ -1187,10 +1287,16 @@ again:
if (min_addr) {
min_addr = 0;
goto again;
- } else {
- goto error;
}
+ if (flags & MEMBLOCK_MIRROR) {
+ flags &= ~MEMBLOCK_MIRROR;
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ goto again;
+ }
+
+ return NULL;
done:
memblock_reserve(alloc, size);
ptr = phys_to_virt(alloc);
@@ -1205,9 +1311,6 @@ done:
kmemleak_alloc(ptr, size, 0, 0);
return ptr;
-
-error:
- return NULL;
}
/**
@@ -1316,7 +1419,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
end = PFN_DOWN(base + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
+ __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
totalram_pages++;
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f816d91c643b7e..fe7b150d5e558e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -287,9 +287,9 @@ struct mem_cgroup {
*/
bool use_hierarchy;
+ /* protected by memcg_oom_lock */
bool oom_lock;
- atomic_t under_oom;
- atomic_t oom_wakeups;
+ int under_oom;
int swappiness;
/* OOM-Killer disable */
@@ -1551,17 +1551,20 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int points = 0;
struct task_struct *chosen = NULL;
+ mutex_lock(&oom_lock);
+
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+
/*
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- mark_tsk_oom_victim(current);
- return;
+ mark_oom_victim(current);
+ goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1585,7 +1588,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
- return;
+ goto unlock;
case OOM_SCAN_OK:
break;
};
@@ -1606,11 +1609,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_end(&it);
}
- if (!chosen)
- return;
- points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
- NULL, "Memory cgroup out of memory");
+ if (chosen) {
+ points = chosen_points * 1000 / totalpages;
+ oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+ memcg, NULL, "Memory cgroup out of memory");
+ }
+unlock:
+ mutex_unlock(&oom_lock);
}
#if MAX_NUMNODES > 1
@@ -1827,8 +1832,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
- atomic_inc(&iter->under_oom);
+ iter->under_oom++;
+ spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1837,11 +1844,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
/*
* When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. We have to use
- * atomic_add_unless() here.
+ * mem_cgroup_oom_lock() may not be called. Watch for underflow.
*/
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
- atomic_add_unless(&iter->under_oom, -1, 0);
+ if (iter->under_oom > 0)
+ iter->under_oom--;
+ spin_unlock(&memcg_oom_lock);
}
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1867,17 +1876,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
return autoremove_wake_function(wait, mode, sync, arg);
}
-static void memcg_wakeup_oom(struct mem_cgroup *memcg)
-{
- atomic_inc(&memcg->oom_wakeups);
- /* for filtering, pass "memcg" as argument. */
- __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
- if (memcg && atomic_read(&memcg->under_oom))
- memcg_wakeup_oom(memcg);
+ /*
+ * For the following lockless ->under_oom test, the only required
+ * guarantee is that it must see the state asserted by an OOM when
+ * this function is called as a result of userland actions
+ * triggered by the notification of the OOM. This is trivially
+ * achieved by invoking mem_cgroup_mark_under_oom() before
+ * triggering notification.
+ */
+ if (memcg && memcg->under_oom)
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@ -2318,6 +2328,8 @@ done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ if (!(gfp_mask & __GFP_WAIT))
+ goto done;
/*
* If the hierarchy is above the normal consumption range,
* make the charging task trim their excess contribution.
@@ -3857,7 +3869,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
- if (atomic_read(&memcg->under_oom))
+ if (memcg->under_oom)
eventfd_signal(eventfd, 1);
spin_unlock(&memcg_oom_lock);
@@ -3886,7 +3898,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
- seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
return 0;
}
@@ -5954,9 +5966,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
- /* XXX: caller holds IRQ-safe mapping->tree_lock */
- VM_BUG_ON(!irqs_disabled());
-
+ /* Caller disabled preemption with mapping->tree_lock */
mem_cgroup_charge_statistics(memcg, page, -1);
memcg_check_events(memcg, page);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 501820c815b335..1cf7f2988422a6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -20,6 +20,14 @@
* this code has to be extremely careful. Generally it tries to use
* normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ * tools/vm/page-types when running a real workload.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
@@ -28,13 +36,6 @@
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
-
-/*
- * Notebook:
- * - hugetlb needs more code
- * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
- * - pass bad pages to kdump next kernel
- */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -56,6 +57,7 @@
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"
+#include "ras/ras_event.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -503,68 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
kfree(tk);
}
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
- IGNORED, /* Error: cannot be handled */
- FAILED, /* Error: handling failed */
- DELAYED, /* Will be handled later */
- RECOVERED, /* Successfully recovered */
-};
-
static const char *action_name[] = {
- [IGNORED] = "Ignored",
- [FAILED] = "Failed",
- [DELAYED] = "Delayed",
- [RECOVERED] = "Recovered",
-};
-
-enum action_page_type {
- MSG_KERNEL,
- MSG_KERNEL_HIGH_ORDER,
- MSG_SLAB,
- MSG_DIFFERENT_COMPOUND,
- MSG_POISONED_HUGE,
- MSG_HUGE,
- MSG_FREE_HUGE,
- MSG_UNMAP_FAILED,
- MSG_DIRTY_SWAPCACHE,
- MSG_CLEAN_SWAPCACHE,
- MSG_DIRTY_MLOCKED_LRU,
- MSG_CLEAN_MLOCKED_LRU,
- MSG_DIRTY_UNEVICTABLE_LRU,
- MSG_CLEAN_UNEVICTABLE_LRU,
- MSG_DIRTY_LRU,
- MSG_CLEAN_LRU,
- MSG_TRUNCATED_LRU,
- MSG_BUDDY,
- MSG_BUDDY_2ND,
- MSG_UNKNOWN,
+ [MF_IGNORED] = "Ignored",
+ [MF_FAILED] = "Failed",
+ [MF_DELAYED] = "Delayed",
+ [MF_RECOVERED] = "Recovered",
};
static const char * const action_page_types[] = {
- [MSG_KERNEL] = "reserved kernel page",
- [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
- [MSG_SLAB] = "kernel slab page",
- [MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
- [MSG_POISONED_HUGE] = "huge page already hardware poisoned",
- [MSG_HUGE] = "huge page",
- [MSG_FREE_HUGE] = "free huge page",
- [MSG_UNMAP_FAILED] = "unmapping failed page",
- [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
- [MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
- [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
- [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
- [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
- [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
- [MSG_DIRTY_LRU] = "dirty LRU page",
- [MSG_CLEAN_LRU] = "clean LRU page",
- [MSG_TRUNCATED_LRU] = "already truncated LRU page",
- [MSG_BUDDY] = "free buddy page",
- [MSG_BUDDY_2ND] = "free buddy page (2nd try)",
- [MSG_UNKNOWN] = "unknown page",
+ [MF_MSG_KERNEL] = "reserved kernel page",
+ [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
+ [MF_MSG_SLAB] = "kernel slab page",
+ [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
+ [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
+ [MF_MSG_HUGE] = "huge page",
+ [MF_MSG_FREE_HUGE] = "free huge page",
+ [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
+ [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
+ [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
+ [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
+ [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
+ [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
+ [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
+ [MF_MSG_DIRTY_LRU] = "dirty LRU page",
+ [MF_MSG_CLEAN_LRU] = "clean LRU page",
+ [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
+ [MF_MSG_BUDDY] = "free buddy page",
+ [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MF_MSG_UNKNOWN] = "unknown page",
};
/*
@@ -598,7 +566,7 @@ static int delete_from_lru_cache(struct page *p)
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
- return IGNORED;
+ return MF_IGNORED;
}
/*
@@ -607,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -616,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
int err;
- int ret = FAILED;
+ int ret = MF_FAILED;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -626,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* should be the one m_f() holds.
*/
if (PageAnon(p))
- return RECOVERED;
+ return MF_RECOVERED;
/*
* Now truncate the page in the page cache. This is really
@@ -640,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -657,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
- ret = RECOVERED;
+ ret = MF_RECOVERED;
}
} else {
/*
@@ -665,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* This fails on dirty or anything with private pages
*/
if (invalidate_inode_page(p))
- ret = RECOVERED;
+ ret = MF_RECOVERED;
else
printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
pfn);
@@ -751,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ClearPageUptodate(p);
if (!delete_from_lru_cache(p))
- return DELAYED;
+ return MF_DELAYED;
else
- return FAILED;
+ return MF_FAILED;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -761,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
delete_from_swap_cache(p);
if (!delete_from_lru_cache(p))
- return RECOVERED;
+ return MF_RECOVERED;
else
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -776,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
{
int res = 0;
struct page *hpage = compound_head(p);
+
+ if (!PageHuge(hpage))
+ return MF_DELAYED;
+
/*
* We can safely recover from error on free or reserved (i.e.
* not in-use) hugepage by dequeuing it from freelist.
@@ -789,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
if (!(page_mapping(hpage) || PageAnon(hpage))) {
res = dequeue_hwpoisoned_huge_page(hpage);
if (!res)
- return RECOVERED;
+ return MF_RECOVERED;
}
- return DELAYED;
+ return MF_DELAYED;
}
/*
@@ -823,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
static struct page_state {
unsigned long mask;
unsigned long res;
- enum action_page_type type;
+ enum mf_action_page_type type;
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
- { reserved, reserved, MSG_KERNEL, me_kernel },
+ { reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
* PG_buddy pages only make a small fraction of all free pages.
@@ -837,31 +809,31 @@ static struct page_state {
* currently unused objects without touching them. But just
* treat it as standard kernel for now.
*/
- { slab, slab, MSG_SLAB, me_kernel },
+ { slab, slab, MF_MSG_SLAB, me_kernel },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
- { head, head, MSG_HUGE, me_huge_page },
- { tail, tail, MSG_HUGE, me_huge_page },
+ { head, head, MF_MSG_HUGE, me_huge_page },
+ { tail, tail, MF_MSG_HUGE, me_huge_page },
#else
- { compound, compound, MSG_HUGE, me_huge_page },
+ { compound, compound, MF_MSG_HUGE, me_huge_page },
#endif
- { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
- { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
+ { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+ { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
- { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
- { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
+ { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
- { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
- { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
+ { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
- { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty },
- { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean },
+ { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
+ { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
/*
* Catchall entry: must be at end.
*/
- { 0, 0, MSG_UNKNOWN, me_unknown },
+ { 0, 0, MF_MSG_UNKNOWN, me_unknown },
};
#undef dirty
@@ -881,8 +853,11 @@ static struct page_state {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, enum action_page_type type, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
+ trace_memory_failure_event(pfn, type, result);
+
pr_err("MCE %#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -896,13 +871,13 @@ static int page_action(struct page_state *ps, struct page *p,
result = ps->action(p, pfn);
count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == DELAYED)
+ if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
printk(KERN_ERR
"MCE %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
- result = FAILED;
+ result = MF_FAILED;
}
action_result(pfn, ps->type, result);
@@ -911,9 +886,42 @@ static int page_action(struct page_state *ps, struct page *p,
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+int get_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (PageHuge(head))
+ return get_page_unless_zero(head);
+
+ /*
+ * Thp tail page has special refcounting rule (refcount of tail pages
+ * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+ * directly for tail pages.
+ */
+ if (PageTransHuge(head)) {
+ if (get_page_unless_zero(head)) {
+ if (PageTail(page))
+ get_page(page);
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ return get_page_unless_zero(page);
+}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
+
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -927,7 +935,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
- struct page *ppage;
/*
* Here we are interested only in user-mapped pages, so skip any
@@ -977,59 +984,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
- * ppage: poisoned page
- * if p is regular page(4k page)
- * ppage == real poisoned page;
- * else p is hugetlb or THP, ppage == head page.
- */
- ppage = hpage;
-
- if (PageTransHuge(hpage)) {
- /*
- * Verify that this isn't a hugetlbfs head page, the check for
- * PageAnon is just for avoid tripping a split_huge_page
- * internal debug check, as split_huge_page refuses to deal with
- * anything that isn't an anon page. PageAnon can't go away fro
- * under us because we hold a refcount on the hpage, without a
- * refcount on the hpage. split_huge_page can't be safely called
- * in the first place, having a refcount on the tail isn't
- * enough * to be safe.
- */
- if (!PageHuge(hpage) && PageAnon(hpage)) {
- if (unlikely(split_huge_page(hpage))) {
- /*
- * FIXME: if splitting THP is failed, it is
- * better to stop the following operation rather
- * than causing panic by unmapping. System might
- * survive if the page is freed later.
- */
- printk(KERN_INFO
- "MCE %#lx: failed to split THP\n", pfn);
-
- BUG_ON(!PageHWPoison(p));
- return SWAP_FAIL;
- }
- /*
- * We pinned the head page for hwpoison handling,
- * now we split the thp and we are interested in
- * the hwpoisoned raw page, so move the refcount
- * to it. Similarly, page lock is shifted.
- */
- if (hpage != p) {
- if (!(flags & MF_COUNT_INCREASED)) {
- put_page(hpage);
- get_page(p);
- }
- lock_page(p);
- unlock_page(hpage);
- *hpagep = p;
- }
- /* THP is split, so ppage should be the real poisoned page. */
- ppage = p;
- }
- }
-
- /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -1038,12 +992,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(ppage, ttu);
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(ppage));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1055,7 +1009,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
kill_procs(&tokill, forcekill, trapno,
ret != SWAP_SUCCESS, p, pfn, flags);
@@ -1101,6 +1055,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
struct page_state *ps;
struct page *p;
struct page *hpage;
+ struct page *orig_head;
int res;
unsigned int nr_pages;
unsigned long page_flags;
@@ -1116,7 +1071,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
- hpage = compound_head(p);
+ orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
@@ -1149,10 +1104,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(hpage)) {
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
- action_result(pfn, MSG_BUDDY, DELAYED);
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
return 0;
} else if (PageHuge(hpage)) {
/*
@@ -1169,37 +1123,60 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, MSG_FREE_HUGE,
- res ? IGNORED : DELAYED);
+ action_result(pfn, MF_MSG_FREE_HUGE,
+ res ? MF_IGNORED : MF_DELAYED);
unlock_page(hpage);
return res;
} else {
- action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
+ action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
return -EBUSY;
}
}
+ if (!PageHuge(p) && PageTransHuge(hpage)) {
+ if (!PageAnon(hpage)) {
+ pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+ if (TestClearPageHWPoison(p))
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(p);
+ if (p != hpage)
+ put_page(hpage);
+ return -EBUSY;
+ }
+ if (unlikely(split_huge_page(hpage))) {
+ pr_err("MCE: %#lx: thp split failed\n", pfn);
+ if (TestClearPageHWPoison(p))
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(p);
+ if (p != hpage)
+ put_page(hpage);
+ return -EBUSY;
+ }
+ VM_BUG_ON_PAGE(!page_count(p), p);
+ hpage = compound_head(p);
+ }
+
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
if (!PageHuge(p)) {
- if (!PageLRU(hpage))
- shake_page(hpage, 0);
- if (!PageLRU(hpage)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
/*
* shake_page could have turned it free.
*/
if (is_free_buddy_page(p)) {
if (flags & MF_COUNT_INCREASED)
- action_result(pfn, MSG_BUDDY, DELAYED);
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
else
- action_result(pfn, MSG_BUDDY_2ND,
- DELAYED);
+ action_result(pfn, MF_MSG_BUDDY_2ND,
+ MF_DELAYED);
return 0;
}
}
@@ -1211,8 +1188,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* The page could have changed compound pages during the locking.
* If this happens just bail out.
*/
- if (compound_head(p) != hpage) {
- action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
+ if (PageCompound(p) && compound_head(p) != orig_head) {
+ action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1252,7 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* on the head page to show that the hugepage is hwpoisoned
*/
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, MSG_POISONED_HUGE, IGNORED);
+ action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
unlock_page(hpage);
put_page(hpage);
return 0;
@@ -1281,7 +1258,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
!= SWAP_SUCCESS) {
- action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1290,7 +1267,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
+ action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1450,12 +1427,12 @@ int unpoison_memory(unsigned long pfn)
*/
if (!PageHuge(page) && PageTransHuge(page)) {
pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
- return 0;
+ return 0;
}
nr_pages = 1 << compound_order(page);
- if (!get_page_unless_zero(page)) {
+ if (!get_hwpoison_page(p)) {
/*
* Since HWPoisoned hugepage should have non-zero refcount,
* race between memory failure and unpoison seems to happen.
@@ -1523,7 +1500,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
- if (!get_page_unless_zero(compound_head(p))) {
+ if (!get_hwpoison_page(p)) {
if (PageHuge(p)) {
pr_info("%s: %#lx free huge page\n", __func__, pfn);
ret = 0;
@@ -1694,20 +1671,7 @@ static int __soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- /*
- * After page migration succeeds, the source page can
- * be trapped in pagevec and actual freeing is delayed.
- * Freeing code works differently based on PG_hwpoison,
- * so there's a race. We need to make sure that the
- * source page should be freed back to buddy before
- * setting PG_hwpoison.
- */
- if (!is_free_buddy_page(page))
- drain_all_pages(page_zone(page));
SetPageHWPoison(page);
- if (!is_free_buddy_page(page))
- pr_info("soft offline: %#lx: page leaked\n",
- pfn);
atomic_long_inc(&num_poisoned_pages);
}
} else {
@@ -1759,14 +1723,6 @@ int soft_offline_page(struct page *page, int flags)
get_online_mems();
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free. This flag should be kept set until the source page
- * is freed and PG_hwpoison on it is set.
- */
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- set_migratetype_isolate(page, true);
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0) { /* for in-use pages */
@@ -1785,6 +1741,5 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
- unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 17734c3c1183ed..e1c45d09806481 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,7 @@
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -2081,11 +2082,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
- __SetPageUptodate(new_page);
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
+ __SetPageUptodate(new_page);
+
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
@@ -2680,6 +2682,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
goto setpte;
}
@@ -2689,6 +2697,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ goto oom_free_page;
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
@@ -2696,9 +2708,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
- goto oom_free_page;
-
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -2707,6 +2716,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_none(*page_table))
goto release;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ page_cache_release(page);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
+
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
@@ -3067,7 +3085,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 457bde530cbedc..4602e3a38c714c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1332,7 +1332,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) is belongs to the same zone.
+ * Confirm all pages in a range [start, end) belong to the same zone.
*/
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1343,10 +1343,11 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
for (pfn = start_pfn;
pfn < end_pfn;
pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
- i++;
+ /* Find the first valid pfn in this pageblock */
+ for (i = 0; i < MAX_ORDER_NR_PAGES; i++) {
+ if (pfn_valid(pfn + i))
+ break;
+ }
if (i == MAX_ORDER_NR_PAGES)
continue;
page = pfn_to_page(pfn + i);
@@ -1969,8 +1970,10 @@ void try_offline_node(int nid)
* wait_table may be allocated from boot memory,
* here only free if it's allocated by vmalloc.
*/
- if (is_vmalloc_addr(zone->wait_table))
+ if (is_vmalloc_addr(zone->wait_table)) {
vfree(zone->wait_table);
+ zone->wait_table = NULL;
+ }
}
}
EXPORT_SYMBOL(try_offline_node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 747743237d9f4d..eeec77abd16831 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff,
- new_pol);
+ vma->anon_vma, vma->vm_file, pgoff,
+ new_pol, vma->vm_userfaultfd_ctx);
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/memtest.c b/mm/memtest.c
index 1997d934b13b00..0a1cc133f6d72a 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
u64 i;
phys_addr_t this_start, this_end;
- for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
+ &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
diff --git a/mm/migrate.c b/mm/migrate.c
index f53838fe3dfe6e..236ee25e79d9fe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -918,7 +918,8 @@ out:
static ICE_noinline int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
- int force, enum migrate_mode mode)
+ int force, enum migrate_mode mode,
+ enum migrate_reason reason)
{
int rc = 0;
int *result = NULL;
@@ -949,7 +950,8 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- putback_lru_page(page);
+ if (reason != MR_MEMORY_FAILURE)
+ putback_lru_page(page);
}
/*
@@ -1122,7 +1124,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
pass > 2, mode);
else
rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode);
+ private, page, pass > 2, mode,
+ reason);
switch(rc) {
case -ENOMEM:
@@ -1746,7 +1749,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1796,7 +1799,7 @@ fail_putback:
*/
flush_cache_range(vma, mmun_start, mmun_end);
page_add_anon_rmap(new_page, vma, mmun_start);
- pmdp_clear_flush_notify(vma, mmun_start, pmd);
+ pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
diff --git a/mm/mlock.c b/mm/mlock.c
index 6fd2cf15e8687d..25936680064fd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
- vma->vm_file, pgoff, vma_policy(vma));
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5f420f7fafa1c6..fdadf918de7691 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
+#include <linux/sched.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void)
BUG_ON(or_mask != add_mask);
}
-void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
- unsigned long nid, unsigned long pfn)
-{
- BUG_ON(page_to_nid(page) != nid);
- BUG_ON(page_zonenum(page) != zone);
- BUG_ON(page_to_pfn(page) != pfn);
-}
-
static __init int set_mminit_loglevel(char *str)
{
get_option(&str, &mminit_loglevel);
diff --git a/mm/mmap.c b/mm/mmap.c
index bb50cacc3ea576..34033f3b05f2af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -41,6 +41,7 @@
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
+#include <linux/userfaultfd_k.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end);
* per-vma resources, so we don't attempt to merge those.
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags)
+ struct file *file, unsigned long vm_flags,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (vma->vm_ops && vma->vm_ops->close)
return 0;
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+ return 0;
return 1;
}
@@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*/
static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+ struct anon_vma *anon_vma, struct file *file,
+ pgoff_t vm_pgoff,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
- if (is_mergeable_vma(vma, file, vm_flags) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
- pgoff_t pgoff, struct mempolicy *policy)
+ pgoff_t pgoff, struct mempolicy *policy,
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
- anon_vma, file, pgoff)) {
+ anon_vma, file, pgoff,
+ vm_userfaultfd_ctx)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen) &&
+ anon_vma, file,
+ pgoff+pglen,
+ vm_userfaultfd_ctx) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
- anon_vma, file, pgoff+pglen)) {
+ anon_vma, file, pgoff+pglen,
+ vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL);
@@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
- NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
@@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma)
goto out;
@@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88584838e7046b..ef5be8eaab0017 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include "internal.h"
+
/*
* For a prot_numa update we only hold mmap_sem for read so there is a
* potential race with faulting where a pmd was temporarily none. This
@@ -290,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
*/
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
- vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+ vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx);
if (*pprev) {
vma = *pprev;
goto success;
@@ -322,6 +325,15 @@ success:
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
+ /*
+ * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
+ * fault on access.
+ */
+ if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
+ (newflags & VM_WRITE)) {
+ populate_vma_page_range(vma, start, end, NULL);
+ }
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index 034e2d3606522b..a7c93eceb1c8d1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched/sysctl.h>
#include <linux/uaccess.h>
+#include <linux/mm-arch-hooks.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
- } else if (vma->vm_file && vma->vm_file->f_op->mremap) {
- err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
- if (err < 0) {
- move_page_tables(new_vma, new_addr, vma, old_addr,
- moved_len, true);
- return err;
+ } else {
+ if (vma->vm_file && vma->vm_file->f_op->mremap) {
+ err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+ if (err < 0) {
+ move_page_tables(new_vma, new_addr, vma,
+ old_addr, moved_len, true);
+ return err;
+ }
}
+ arch_remap(mm, old_addr, old_addr + old_len,
+ new_addr, new_addr + new_len);
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b50468333e38..e57cf24babd671 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
{
void *ptr;
u64 addr;
+ ulong flags = choose_memblock_flags();
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+again:
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid,
+ flags);
+ if (!addr && (flags & MEMBLOCK_MIRROR)) {
+ flags &= ~MEMBLOCK_MIRROR;
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ goto again;
+ }
if (!addr)
return NULL;
@@ -77,7 +86,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
end = PFN_DOWN(addr + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
+ __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
totalram_pages++;
}
}
@@ -92,7 +101,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
while (start + (1UL << order) > end)
order--;
- __free_pages_bootmem(pfn_to_page(start), order);
+ __free_pages_bootmem(pfn_to_page(start), start, order);
start += (1UL << order);
}
@@ -121,7 +130,11 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+ for_each_reserved_mem_region(i, &start, &end)
+ reserve_bootmem_region(start, end);
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL)
count += __free_memory_core(start, end);
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
diff --git a/mm/nommu.c b/mm/nommu.c
index e7b24dcec505c0..58ea3643b9e996 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,22 +42,6 @@
#include <asm/mmu_context.h>
#include "internal.h"
-#if 0
-#define kenter(FMT, ...) \
- printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) \
- printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) \
- printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
-#else
-#define kenter(FMT, ...) \
- no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) \
- no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) \
- no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
-#endif
-
void *high_memory;
EXPORT_SYMBOL(high_memory);
struct page *mem_map;
@@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to)
for (; from < to; from += PAGE_SIZE) {
struct page *page = virt_to_page(from);
- kdebug("- free %lx", from);
atomic_long_dec(&mmap_pages_allocated);
- if (page_count(page) != 1)
- kdebug("free page %p: refcount not one: %d",
- page, page_count(page));
put_page(page);
}
}
@@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to)
static void __put_nommu_region(struct vm_region *region)
__releases(nommu_region_sem)
{
- kenter("%p{%d}", region, region->vm_usage);
-
BUG_ON(!nommu_region_tree.rb_node);
if (--region->vm_usage == 0) {
@@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region)
/* IO memory and memory shared directly out of the pagecache
* from ramfs/tmpfs mustn't be released here */
- if (region->vm_flags & VM_MAPPED_COPY) {
- kdebug("free series");
+ if (region->vm_flags & VM_MAPPED_COPY)
free_page_series(region->vm_start, region->vm_top);
- }
kmem_cache_free(vm_region_jar, region);
} else {
up_write(&nommu_region_sem);
@@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
struct address_space *mapping;
struct rb_node **p, *parent, *rb_prev;
- kenter(",%p", vma);
-
BUG_ON(!vma->vm_region);
mm->map_count++;
@@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
struct mm_struct *mm = vma->vm_mm;
struct task_struct *curr = current;
- kenter("%p", vma);
-
protect_vma(vma, 0);
mm->map_count--;
@@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
*/
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
- kenter("%p", vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
@@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file,
int ret;
/* do the simple checks first */
- if (flags & MAP_FIXED) {
- printk(KERN_DEBUG
- "%d: Can't do fixed-address/overlay mmap of RAM\n",
- current->pid);
+ if (flags & MAP_FIXED)
return -EINVAL;
- }
if ((flags & MAP_TYPE) != MAP_PRIVATE &&
(flags & MAP_TYPE) != MAP_SHARED)
@@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file,
) {
capabilities &= ~NOMMU_MAP_DIRECT;
if (flags & MAP_SHARED) {
- printk(KERN_WARNING
- "MAP_SHARED not completely supported on !MMU\n");
+ pr_warn("MAP_SHARED not completely supported on !MMU\n");
return -EINVAL;
}
}
@@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
* we're allocating is smaller than a page
*/
order = get_order(len);
- kdebug("alloc order %d for %lx", order, len);
-
total = 1 << order;
point = len >> PAGE_SHIFT;
/* we don't want to allocate a power-of-2 sized page set */
- if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+ if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
total = point;
- kdebug("try to alloc exact %lu pages", total);
- }
base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
if (!base)
@@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file,
unsigned long capabilities, vm_flags, result;
int ret;
- kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
-
*populate = 0;
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
&capabilities);
- if (ret < 0) {
- kleave(" = %d [val]", ret);
+ if (ret < 0)
return ret;
- }
/* we ignore the address hint */
addr = 0;
@@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_start = start;
vma->vm_end = start + len;
- if (pregion->vm_flags & VM_MAPPED_COPY) {
- kdebug("share copy");
+ if (pregion->vm_flags & VM_MAPPED_COPY)
vma->vm_flags |= VM_MAPPED_COPY;
- } else {
- kdebug("share mmap");
+ else {
ret = do_mmap_shared_file(vma);
if (ret < 0) {
vma->vm_region = NULL;
@@ -1467,7 +1423,6 @@ share:
up_write(&nommu_region_sem);
- kleave(" = %lx", result);
return result;
error_just_free:
@@ -1479,27 +1434,24 @@ error:
if (vma->vm_file)
fput(vma->vm_file);
kmem_cache_free(vm_area_cachep, vma);
- kleave(" = %d", ret);
return ret;
sharing_violation:
up_write(&nommu_region_sem);
- printk(KERN_WARNING "Attempt to share mismatched mappings\n");
+ pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
error_getting_vma:
kmem_cache_free(vm_region_jar, region);
- printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
- " from process %d failed\n",
- len, current->pid);
+ pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
+ len, current->pid);
show_free_areas(0);
return -ENOMEM;
error_getting_region:
- printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
- " from process %d failed\n",
- len, current->pid);
+ pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
+ len, current->pid);
show_free_areas(0);
return -ENOMEM;
}
@@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_region *region;
unsigned long npages;
- kenter("");
-
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
if (vma->vm_file)
@@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm,
{
struct vm_region *region;
- kenter("");
-
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
delete_vma_from_mm(vma);
@@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
unsigned long end;
int ret;
- kenter(",%lx,%zx", start, len);
-
len = PAGE_ALIGN(len);
if (len == 0)
return -EINVAL;
@@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if (!vma) {
static int limit;
if (limit < 5) {
- printk(KERN_WARNING
- "munmap of memory not mmapped by process %d"
- " (%s): 0x%lx-0x%lx\n",
- current->pid, current->comm,
- start, start + len - 1);
+ pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
+ current->pid, current->comm,
+ start, start + len - 1);
limit++;
}
return -EINVAL;
@@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* we're allowed to split an anonymous VMA but not a file-backed one */
if (vma->vm_file) {
do {
- if (start > vma->vm_start) {
- kleave(" = -EINVAL [miss]");
+ if (start > vma->vm_start)
return -EINVAL;
- }
if (end == vma->vm_end)
goto erase_whole_vma;
vma = vma->vm_next;
} while (vma);
- kleave(" = -EINVAL [split file]");
return -EINVAL;
} else {
/* the chunk must be a subset of the VMA found */
if (start == vma->vm_start && end == vma->vm_end)
goto erase_whole_vma;
- if (start < vma->vm_start || end > vma->vm_end) {
- kleave(" = -EINVAL [superset]");
+ if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- }
- if (start & ~PAGE_MASK) {
- kleave(" = -EINVAL [unaligned start]");
+ if (start & ~PAGE_MASK)
return -EINVAL;
- }
- if (end != vma->vm_end && end & ~PAGE_MASK) {
- kleave(" = -EINVAL [unaligned split]");
+ if (end != vma->vm_end && end & ~PAGE_MASK)
return -EINVAL;
- }
if (start != vma->vm_start && end != vma->vm_end) {
ret = split_vma(mm, vma, start, 1);
- if (ret < 0) {
- kleave(" = %d [split]", ret);
+ if (ret < 0)
return ret;
- }
}
return shrink_vma(mm, vma, start, end);
}
@@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
erase_whole_vma:
delete_vma_from_mm(vma);
delete_vma(mm, vma);
- kleave(" = 0");
return 0;
}
EXPORT_SYMBOL(do_munmap);
@@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm)
if (!mm)
return;
- kenter("");
-
mm->total_vm = 0;
while ((vma = mm->mmap)) {
@@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm)
delete_vma(mm, vma);
cond_resched();
}
-
- kleave("");
}
unsigned long vm_brk(unsigned long addr, unsigned long len)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da1b3c920..d7fb1275e20069 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,8 @@
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+DEFINE_MUTEX(oom_lock);
#ifdef CONFIG_NUMA
/**
@@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
-static DECLARE_RWSEM(oom_sem);
/**
- * mark_tsk_oom_victim - marks the given task as OOM victim.
+ * mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
*
- * Has to be called with oom_sem taken for read and never after
+ * Has to be called with oom_lock held and never after
* oom has been disabled already.
*/
-void mark_tsk_oom_victim(struct task_struct *tsk)
+void mark_oom_victim(struct task_struct *tsk)
{
WARN_ON(oom_killer_disabled);
/* OOM killer might race with memcg OOM */
@@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
}
/**
- * unmark_oom_victim - unmarks the current task as OOM victim.
- *
- * Wakes up all waiters in oom_killer_disable()
+ * exit_oom_victim - note the exit of an OOM victim
*/
-void unmark_oom_victim(void)
+void exit_oom_victim(void)
{
- if (!test_and_clear_thread_flag(TIF_MEMDIE))
- return;
+ clear_thread_flag(TIF_MEMDIE);
- down_read(&oom_sem);
- /*
- * There is no need to signal the lasst oom_victim if there
- * is nobody who cares.
- */
- if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+ if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
- up_read(&oom_sem);
}
/**
@@ -469,14 +460,14 @@ bool oom_killer_disable(void)
* Make sure to not race with an ongoing OOM killer
* and that the current is not the victim.
*/
- down_write(&oom_sem);
+ mutex_lock(&oom_lock);
if (test_thread_flag(TIF_MEMDIE)) {
- up_write(&oom_sem);
+ mutex_unlock(&oom_lock);
return false;
}
oom_killer_disabled = true;
- up_write(&oom_sem);
+ mutex_unlock(&oom_lock);
wait_event(oom_victims_wait, !atomic_read(&oom_victims));
@@ -488,9 +479,7 @@ bool oom_killer_disable(void)
*/
void oom_killer_enable(void)
{
- down_write(&oom_sem);
oom_killer_disabled = false;
- up_write(&oom_sem);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -498,7 +487,7 @@ void oom_killer_enable(void)
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+static void __oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message)
@@ -511,24 +500,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- /*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just set TIF_MEMDIE so it can die quickly
- */
- task_lock(p);
- if (p->mm && task_will_free_mem(p)) {
- mark_tsk_oom_victim(p);
- task_unlock(p);
- put_task_struct(p);
- return;
- }
- task_unlock(p);
-
if (__ratelimit(&oom_rs))
dump_header(p, gfp_mask, order, memcg, nodemask);
task_lock(p);
- pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
task_unlock(p);
@@ -572,7 +548,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
- mark_tsk_oom_victim(victim);
+ mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -608,6 +584,28 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
#undef K
+void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *memcg, nodemask_t *nodemask,
+ const char *message)
+{
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just set TIF_MEMDIE so it can die quickly
+ */
+ task_lock(p);
+ if (p->mm && task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ __oom_kill_process(p, gfp_mask, order, points, totalpages, memcg,
+ nodemask, message);
+}
+
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
@@ -645,67 +643,48 @@ int unregister_oom_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
+/**
+ * force_out_of_memory - forces OOM killer
+ *
+ * External trigger for the OOM killer. The system doesn't have to be under
+ * OOM condition (e.g. sysrq+f).
*/
-bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
+bool force_out_of_memory(void)
{
- struct zoneref *z;
- struct zone *zone;
- bool ret = true;
-
- spin_lock(&zone_scan_lock);
- for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
- ret = false;
- goto out;
- }
-
- /*
- * Lock each zone in the zonelist under zone_scan_lock so a parallel
- * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
- */
- for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- set_bit(ZONE_OOM_LOCKED, &zone->flags);
+ struct zonelist *zonelist;
+ struct task_struct *p;
+ unsigned long totalpages;
+ unsigned int points;
-out:
- spin_unlock(&zone_scan_lock);
- return ret;
-}
+ if (oom_killer_disabled)
+ return false;
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
- struct zoneref *z;
- struct zone *zone;
+ zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
+ constrained_alloc(zonelist, GFP_KERNEL, NULL, &totalpages);
+ p = select_bad_process(&points, totalpages, NULL, true);
+ if (p != (void *)-1UL)
+ __oom_kill_process(p, GFP_KERNEL, 0, points, totalpages, NULL,
+ NULL, "Forced out of memory killer");
+ else
+ pr_warn("Forced out of memory. No killable task found...\n");
- spin_lock(&zone_scan_lock);
- for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- clear_bit(ZONE_OOM_LOCKED, &zone->flags);
- spin_unlock(&zone_scan_lock);
+ return true;
}
/**
- * __out_of_memory - kill the "best" process when we run out of memory
+ * out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
* @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
@@ -715,10 +694,22 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
enum oom_constraint constraint = CONSTRAINT_NONE;
int killed = 0;
+ if (oom_killer_disabled)
+ return false;
+
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
- return;
+ goto out;
+
+ /*
+ * Check if there were limitations on the allocation (only relevant for
+ * NUMA) that may require different handling.
+ */
+ constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+ &totalpages);
+ mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
/*
* If current has a pending SIGKILL or is exiting, then automatically
@@ -730,19 +721,10 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
- mark_tsk_oom_victim(current);
- return;
+ mark_oom_victim(current);
+ goto out;
}
- /*
- * Check if there were limitations on the allocation (only relevant for
- * NUMA) that may require different handling.
- */
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
-
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
@@ -753,7 +735,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
goto out;
}
- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(&points, totalpages, mpol_mask, false);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
@@ -771,32 +753,8 @@ out:
*/
if (killed)
schedule_timeout_killable(1);
-}
-
-/**
- * out_of_memory - tries to invoke OOM killer.
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
- *
- * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
- * when it returns false. Otherwise returns true.
- */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
-{
- bool ret = false;
- down_read(&oom_sem);
- if (!oom_killer_disabled) {
- __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
- ret = true;
- }
- up_read(&oom_sem);
-
- return ret;
+ return true;
}
/*
@@ -806,27 +764,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
void pagefault_out_of_memory(void)
{
- struct zonelist *zonelist;
-
- down_read(&oom_sem);
if (mem_cgroup_oom_synchronize(true))
- goto unlock;
+ return;
- zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
- if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
- if (!oom_killer_disabled)
- __out_of_memory(NULL, 0, 0, NULL, false);
- else
- /*
- * There shouldn't be any user tasks runable while the
- * OOM killer is disabled so the current task has to
- * be a racing OOM victim for which oom_killer_disable()
- * is waiting for.
- */
- WARN_ON(test_thread_flag(TIF_MEMDIE));
+ if (!mutex_trylock(&oom_lock))
+ return;
- oom_zonelist_unlock(zonelist, GFP_KERNEL);
+ if (!out_of_memory(NULL, 0, 0, NULL)) {
+ /*
+ * There shouldn't be any user tasks runnable while the
+ * OOM killer is disabled, so the current task has to
+ * be a racing OOM victim for which oom_killer_disable()
+ * is waiting for.
+ */
+ WARN_ON(test_thread_flag(TIF_MEMDIE));
}
-unlock:
- up_read(&oom_sem);
+
+ mutex_unlock(&oom_lock);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2fd31aebef30c4..c2a6d3bfa27973 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
+#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
@@ -61,6 +62,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
+#include <linux/kthread.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -235,6 +237,77 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/* Returns true if the struct page for the pfn is uninitialised */
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+{
+ int nid = early_pfn_to_nid(pfn);
+
+ if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns false when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static inline bool update_defer_init(pg_data_t *pgdat,
+ unsigned long pfn, unsigned long zone_end,
+ unsigned long *nr_initialised)
+{
+ /* Always populate low zones for address-contrained allocations */
+ if (zone_end < pgdat_end_pfn(pgdat))
+ return true;
+
+ /* Initialise at least 2G of the highest zone */
+ (*nr_initialised)++;
+ if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ pgdat->first_deferred_pfn = pfn;
+ return false;
+ }
+
+ return true;
+}
+#else
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+}
+
+static inline bool early_page_uninitialised(unsigned long pfn)
+{
+ return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ return false;
+}
+
+static inline bool update_defer_init(pg_data_t *pgdat,
+ unsigned long pfn, unsigned long zone_end,
+ unsigned long *nr_initialised)
+{
+ return true;
+}
+#endif
+
+
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
@@ -373,6 +446,7 @@ void prep_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
+ p->mapping = TAIL_MAPPING;
p->first_page = page;
/* Make sure p->first_page is always valid for PageTail() */
smp_wmb();
@@ -380,20 +454,6 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
-static inline void prep_zero_page(struct page *page, unsigned int order,
- gfp_t gfp_flags)
-{
- int i;
-
- /*
- * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
- * and __GFP_HIGHMEM from hard or soft interrupt context.
- */
- VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
- for (i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
-}
-
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly;
@@ -765,6 +825,12 @@ static void free_one_page(struct zone *zone,
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ page->mapping = NULL;
+ return 1;
+ }
+ page->mapping = NULL;
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return 0;
if (unlikely(!PageTail(page))) {
@@ -778,6 +844,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
return 0;
}
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
+ int nid)
+{
+ return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void init_reserved_page(unsigned long pfn)
+{
+ pg_data_t *pgdat;
+ int nid, zid;
+
+ if (!early_page_uninitialised(pfn))
+ return;
+
+ nid = early_pfn_to_nid(pfn);
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ break;
+ }
+ __init_single_pfn(pfn, zid, nid);
+}
+#else
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+{
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ for (; start_pfn < end_pfn; start_pfn++) {
+ if (pfn_valid(start_pfn)) {
+ struct page *page = pfn_to_page(start_pfn);
+
+ init_reserved_page(start_pfn);
+ SetPageReserved(page);
+ }
+ }
+}
+
static bool free_pages_prepare(struct page *page, unsigned int order)
{
bool compound = PageCompound(page);
@@ -832,7 +967,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page,
+ unsigned long pfn, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -852,6 +988,223 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order)
__free_pages(page, order);
}
+#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
+ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+ int nid;
+
+ /* The system will behave unpredictably otherwise */
+ BUG_ON(system_state != SYSTEM_BOOTING);
+
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+ if (nid >= 0)
+ return nid;
+ /* just returns 0 */
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
+{
+ int nid;
+
+ nid = __early_pfn_to_nid(pfn, state);
+ if (nid >= 0 && nid != node)
+ return false;
+ return true;
+}
+
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+ return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
+}
+
+#else
+
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+ return true;
+}
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
+{
+ return true;
+}
+#endif
+
+
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+ unsigned int order)
+{
+ if (early_page_uninitialised(pfn))
+ return;
+ return __free_pages_boot_core(page, pfn, order);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(struct page *page,
+ unsigned long pfn, int nr_pages)
+{
+ int i;
+
+ if (!page)
+ return;
+
+ /* Free a large naturally-aligned chunk if possible */
+ if (nr_pages == MAX_ORDER_NR_PAGES &&
+ (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+ return;
+ }
+
+ for (i = 0; i < nr_pages; i++, page++, pfn++)
+ __free_pages_boot_core(page, pfn, 0);
+}
+
+static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+
+/* Initialise remaining memory on a node */
+static int __init deferred_init_memmap(void *data)
+{
+ pg_data_t *pgdat = data;
+ int nid = pgdat->node_id;
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long start = jiffies;
+ unsigned long nr_pages = 0;
+ unsigned long walk_start, walk_end;
+ int i, zid;
+ struct zone *zone;
+ unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (first_init_pfn == ULONG_MAX) {
+ up_read(&pgdat_init_rwsem);
+ return 0;
+ }
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+ if (first_init_pfn < zone_end_pfn(zone))
+ break;
+ }
+
+ for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
+ unsigned long pfn, end_pfn;
+ struct page *page = NULL;
+ struct page *free_base_page = NULL;
+ unsigned long free_base_pfn = 0;
+ int nr_to_free = 0;
+
+ end_pfn = min(walk_end, zone_end_pfn(zone));
+ pfn = first_init_pfn;
+ if (pfn < walk_start)
+ pfn = walk_start;
+ if (pfn < zone->zone_start_pfn)
+ pfn = zone->zone_start_pfn;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ goto free_range;
+
+ /*
+ * Ensure pfn_valid is checked every
+ * MAX_ORDER_NR_PAGES for memory holes
+ */
+ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if (!pfn_valid(pfn)) {
+ page = NULL;
+ goto free_range;
+ }
+ }
+
+ if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ page = NULL;
+ goto free_range;
+ }
+
+ /* Minimise pfn page lookups and scheduler checks */
+ if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ page++;
+ } else {
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page,
+ free_base_pfn, nr_to_free);
+ free_base_page = NULL;
+ free_base_pfn = nr_to_free = 0;
+
+ page = pfn_to_page(pfn);
+ cond_resched();
+ }
+
+ if (page->flags) {
+ VM_BUG_ON(page_zone(page) != zone);
+ goto free_range;
+ }
+
+ __init_single_page(page, pfn, zid, nid);
+ if (!free_base_page) {
+ free_base_page = page;
+ free_base_pfn = pfn;
+ nr_to_free = 0;
+ }
+ nr_to_free++;
+
+ /* Where possible, batch up pages for a single free */
+ continue;
+free_range:
+ /* Free the current block of pages to allocator */
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page, free_base_pfn,
+ nr_to_free);
+ free_base_page = NULL;
+ free_base_pfn = nr_to_free = 0;
+ }
+
+ first_init_pfn = max(end_pfn, first_init_pfn);
+ }
+
+ /* Sanity check that the next zone really is unpopulated */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+ pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
+ jiffies_to_msecs(jiffies - start));
+ up_read(&pgdat_init_rwsem);
+ return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ down_read(&pgdat_init_rwsem);
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ down_write(&pgdat_init_rwsem);
+ up_write(&pgdat_init_rwsem);
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
@@ -975,7 +1328,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
kasan_alloc_pages(page, order);
if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
+ for (i = 0; i < (1 << order); i++)
+ clear_highpage(page + i);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -2322,48 +2676,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
show_mem(filter);
}
-static inline int
-should_alloc_retry(gfp_t gfp_mask, unsigned int order,
- unsigned long did_some_progress,
- unsigned long pages_reclaimed)
-{
- /* Do not loop if specifically requested */
- if (gfp_mask & __GFP_NORETRY)
- return 0;
-
- /* Always retry if specifically requested */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
- /*
- * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
- * making forward progress without invoking OOM. Suspend also disables
- * storage devices so kswapd will not help. Bail if we are suspending.
- */
- if (!did_some_progress && pm_suspended_storage())
- return 0;
-
- /*
- * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
- * means __GFP_NOFAIL, but that may not be true in other
- * implementations.
- */
- if (order <= PAGE_ALLOC_COSTLY_ORDER)
- return 1;
-
- /*
- * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
- * specified, then we retry until we no longer reclaim any pages
- * (above), or we've reclaimed an order of pages at least as
- * large as the allocation's order. In both cases, if the
- * allocation still fails, we stop retrying.
- */
- if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
- return 1;
-
- return 0;
-}
-
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2373,10 +2685,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 0;
/*
- * Acquire the per-zone oom lock for each zone. If that
- * fails, somebody else is making progress for us.
+ * Acquire the oom lock. If that fails, somebody else is
+ * making progress for us.
*/
- if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+ if (!mutex_trylock(&oom_lock)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
@@ -2402,26 +2714,28 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
- /* The OOM killer does not compensate for light reclaim */
+ /* The OOM killer does not compensate for IO-less reclaim */
if (!(gfp_mask & __GFP_FS)) {
/*
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
- * keep looping as per should_alloc_retry().
+ * keep looping as per tradition.
*/
*did_some_progress = 1;
goto out;
}
+ if (pm_suspended_storage())
+ goto out;
/* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
+ if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask)
|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
- oom_zonelist_unlock(ac->zonelist, gfp_mask);
+ mutex_unlock(&oom_lock);
return page;
}
@@ -2794,40 +3108,40 @@ retry:
if (page)
goto got_pg;
- /* Check if we should retry the allocation */
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ goto noretry;
+
+ /* Keep reclaiming pages as long as there is reasonable progress */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, did_some_progress,
- pages_reclaimed)) {
- /*
- * If we fail to make progress by freeing individual
- * pages, but the allocation wants us to keep going,
- * start OOM killing tasks.
- */
- if (!did_some_progress) {
- page = __alloc_pages_may_oom(gfp_mask, order, ac,
- &did_some_progress);
- if (page)
- goto got_pg;
- if (!did_some_progress)
- goto nopage;
- }
+ if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
+ ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
goto retry;
- } else {
- /*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
- */
- page = __alloc_pages_direct_compact(gfp_mask, order,
- alloc_flags, ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
- if (page)
- goto got_pg;
}
+ /* Reclaim has failed us, start killing things */
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ if (page)
+ goto got_pg;
+
+ /* Retry as long as the OOM killer is making progress */
+ if (did_some_progress)
+ goto retry;
+
+noretry:
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
+ ac, migration_mode,
+ &contended_compaction,
+ &deferred_compaction);
+ if (page)
+ goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
@@ -4203,6 +4517,9 @@ static void setup_zone_migrate_reserve(struct zone *zone)
zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
+ return;
+
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
@@ -4265,15 +4582,16 @@ static void setup_zone_migrate_reserve(struct zone *zone)
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
- struct page *page;
+ pg_data_t *pgdat = NODE_DATA(nid);
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
+ unsigned long nr_initialised = 0;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
- z = &NODE_DATA(nid)->node_zones[zone];
+ z = &pgdat->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
@@ -4285,14 +4603,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn,
+ &nr_initialised))
+ break;
}
- page = pfn_to_page(pfn);
- set_page_links(page, zone, nid, pfn);
- mminit_verify_page_links(page, zone, nid, pfn);
- init_page_count(page);
- page_mapcount_reset(page);
- page_cpupid_reset_last(page);
- SetPageReserved(page);
+
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4307,17 +4622,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
- if ((z->zone_start_pfn <= pfn)
- && (pfn < zone_end_pfn(z))
- && !(pfn & (pageblock_nr_pages - 1)))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ struct page *page = pfn_to_page(pfn);
- INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
+ __init_single_page(page, pfn, zone, nid);
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ } else {
+ __init_single_pfn(pfn, zone, nid);
+ }
}
}
@@ -4575,57 +4887,30 @@ int __meminit init_currently_empty_zone(struct zone *zone,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
unsigned long start_pfn, end_pfn;
int nid;
- /*
- * NOTE: The following SMP-unsafe globals are only used early in boot
- * when the kernel is running single-threaded.
- */
- static unsigned long __meminitdata last_start_pfn, last_end_pfn;
- static int __meminitdata last_nid;
- if (last_start_pfn <= pfn && pfn < last_end_pfn)
- return last_nid;
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
if (nid != -1) {
- last_start_pfn = start_pfn;
- last_end_pfn = end_pfn;
- last_nid = nid;
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
}
return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
- int nid;
-
- nid = __early_pfn_to_nid(pfn);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
-}
-
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- int nid;
-
- nid = __early_pfn_to_nid(pfn);
- if (nid >= 0 && nid != node)
- return false;
- return true;
-}
-#endif
-
/**
* free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -4867,22 +5152,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size,
unsigned long *zholes_size)
{
- unsigned long realtotalpages, totalpages = 0;
+ unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
- for (i = 0; i < MAX_NR_ZONES; i++)
- totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- zones_size);
- pgdat->node_spanned_pages = totalpages;
-
- realtotalpages = totalpages;
- for (i = 0; i < MAX_NR_ZONES; i++)
- realtotalpages -=
- zone_absent_pages_in_node(pgdat->node_id, i,
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ unsigned long size, real_size;
+
+ size = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ zones_size);
+ real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ zone->spanned_pages = size;
+ zone->present_pages = real_size;
+
+ totalpages += size;
+ realtotalpages += real_size;
+ }
+
+ pgdat->node_spanned_pages = totalpages;
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
@@ -4992,8 +5283,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
* NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn,
- unsigned long *zones_size, unsigned long *zholes_size)
+ unsigned long node_start_pfn, unsigned long node_end_pfn)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -5014,12 +5304,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
- size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
- node_end_pfn, zones_size);
- realsize = freesize = size - zone_absent_pages_in_node(nid, j,
- node_start_pfn,
- node_end_pfn,
- zholes_size);
+ size = zone->spanned_pages;
+ realsize = freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
@@ -5054,8 +5340,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
- zone->spanned_pages = size;
- zone->present_pages = realsize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
@@ -5144,6 +5428,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+ reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5161,8 +5446,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ free_area_init_core(pgdat, start_pfn, end_pfn);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6111,9 +6395,9 @@ out:
return ret;
}
+#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;
-#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
if (!str)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908790efca..0e69d259beb7c3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -178,8 +178,11 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
- pfn += pageblock_nr_pages)
- unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
+ pfn += pageblock_nr_pages) {
+ page = __first_valid_page(pfn, pageblock_nr_pages);
+ if (page)
+ unset_migratetype_isolate(page, migratetype);
+ }
return -EBUSY;
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c25f94b338115a..6b674e00153cea 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
+pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ VM_BUG_ON(!pmd_trans_huge(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
@@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+
+#ifndef pmdp_collapse_flush
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ /*
+ * pmd and hugepage pte format are same. So we could
+ * use the same function.
+ */
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 8fc556ce2dcb7f..49b244b1f18c9c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -627,7 +627,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
pmd = pmd_offset(pud, address);
/*
- * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
* without holding anon_vma lock for write. So when looking for a
* genuine pmde (in which to find pte), test present and !THP together.
*/
@@ -714,6 +714,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
}
struct page_referenced_arg {
+ int dirtied;
int mapcount;
int referenced;
unsigned long vm_flags;
@@ -728,6 +729,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ int dirty = 0;
struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
@@ -751,6 +753,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
+
+ /*
+ * Use pmd_freeable instead of raw pmd_dirty because in some
+ * of architecture, pmd_dirty is not defined unless
+ * CONFIG_TRANSPARENT_HUGEPAGE is enabled
+ */
+ if (!pmd_freeable(*pmd))
+ dirty++;
+
spin_unlock(ptl);
} else {
pte_t *pte;
@@ -780,6 +791,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
+
+ if (pte_dirty(*pte))
+ dirty++;
+
pte_unmap_unlock(pte, ptl);
}
@@ -788,6 +803,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pra->vm_flags |= vma->vm_flags;
}
+ if (dirty)
+ pra->dirtied++;
+
pra->mapcount--;
if (!pra->mapcount)
return SWAP_SUCCESS; /* To break the loop */
@@ -812,6 +830,7 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
* @is_locked: caller holds lock on the page
* @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ * @is_pte_dirty: ptes which have marked dirty bit - used for lazyfree page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
@@ -819,7 +838,8 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+ unsigned long *vm_flags,
+ int *is_pte_dirty)
{
int ret;
int we_locked = 0;
@@ -834,6 +854,9 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
+ if (is_pte_dirty)
+ *is_pte_dirty = 0;
+
if (!page_mapped(page))
return 0;
@@ -861,6 +884,9 @@ int page_referenced(struct page *page,
if (we_locked)
unlock_page(page);
+ if (is_pte_dirty)
+ *is_pte_dirty = pra.dirtied;
+
return pra.referenced;
}
@@ -952,7 +978,12 @@ void page_move_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
+ /*
+ * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+ * simultaneously, so a concurrent reader (eg page_referenced()'s
+ * PageAnon()) will not see one without the other.
+ */
+ WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}
/**
@@ -1189,6 +1220,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
spinlock_t *ptl;
int ret = SWAP_AGAIN;
enum ttu_flags flags = (enum ttu_flags)arg;
+ int dirty = 0;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1218,7 +1250,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
+ dirty = pte_dirty(pteval);
+ if (dirty)
set_page_dirty(page);
/* Update high watermark before we lower rss */
@@ -1247,6 +1280,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
+ if (flags & TTU_FREE) {
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!dirty && !PageDirty(page)) {
+ /* It's a freeable page by MADV_FREE */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ } else {
+ set_pte_at(mm, address, pte, pteval);
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+ }
+
if (PageSwapCache(page)) {
/*
* Store the swap location in the pte.
@@ -1287,6 +1333,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
} else
dec_mm_counter(mm, MM_FILEPAGES);
+discard:
page_remove_rmap(page);
page_cache_release(page);
diff --git a/mm/shmem.c b/mm/shmem.c
index a59087edf72848..b03960be82d60c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -981,7 +981,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1173,7 +1173,7 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
diff --git a/mm/slab.c b/mm/slab.c
index 7eb38dd1cefa2f..504adb18522ded 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1454,6 +1454,7 @@ void __init kmem_cache_init(void)
kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
+ setup_kmalloc_cache_index_table();
slab_early_init = 0;
@@ -3415,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
#ifdef CONFIG_TRACING
void *
kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
diff --git a/mm/slab.h b/mm/slab.h
index 4c3ac12dd64405..88b55497738cb8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags,
#ifndef CONFIG_SLOB
/* Kmalloc array related functions */
+void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(unsigned long);
/* Find the kmalloc slab corresponding for a certain size */
@@ -162,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
+/*
+ * Generic implementation of bulk operations
+ * These are useful for situations in which the allocator cannot
+ * perform optimizations. In that case segments of the objecct listed
+ * may be allocated or freed using these operations.
+ */
+void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_MEMCG_KMEM
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 999bb3424d44df..8873985f905ee9 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -105,6 +105,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
}
#endif
+void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++)
+ kmem_cache_free(s, p[i]);
+}
+
+bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+ void **p)
+{
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ void *x = p[i] = kmem_cache_alloc(s, flags);
+ if (!x) {
+ __kmem_cache_free_bulk(s, i, p);
+ return false;
+ }
+ }
+ return true;
+}
+
#ifdef CONFIG_MEMCG_KMEM
void slab_init_memcg_params(struct kmem_cache *s)
{
@@ -784,25 +807,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
}
/*
- * Create the kmalloc array. Some of the regular kmalloc arrays
- * may already have been created because they were needed to
- * enable allocations for slab creation.
+ * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
+ * kmalloc-67108864.
*/
-void __init create_kmalloc_caches(unsigned long flags)
+static struct {
+ const char *name;
+ unsigned long size;
+} const kmalloc_info[] __initconst = {
+ {NULL, 0}, {"kmalloc-96", 96},
+ {"kmalloc-192", 192}, {"kmalloc-8", 8},
+ {"kmalloc-16", 16}, {"kmalloc-32", 32},
+ {"kmalloc-64", 64}, {"kmalloc-128", 128},
+ {"kmalloc-256", 256}, {"kmalloc-512", 512},
+ {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
+ {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
+ {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
+ {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
+ {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
+ {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
+ {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
+ {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
+ {"kmalloc-67108864", 67108864}
+};
+
+/*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+void __init setup_kmalloc_cache_index_table(void)
{
int i;
- /*
- * Patch up the size_index table if we have strange large alignment
- * requirements for the kmalloc array. This is only the case for
- * MIPS it seems. The standard arches will not generate any code here.
- *
- * Largest permitted alignment is 256 bytes due to the way we
- * handle the index determination for the smaller caches.
- *
- * Make sure that nothing crazy happens if someone starts tinkering
- * around with ARCH_KMALLOC_MINALIGN
- */
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
@@ -833,39 +876,41 @@ void __init create_kmalloc_caches(unsigned long flags)
for (i = 128 + 8; i <= 192; i += 8)
size_index[size_index_elem(i)] = 8;
}
- for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ */
+void __init create_kmalloc_caches(unsigned long flags)
+{
+ int i;
+
+ for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[i]) {
- kmalloc_caches[i] = create_kmalloc_cache(NULL,
- 1 << i, flags);
+ kmalloc_caches[i] = create_kmalloc_cache(
+ kmalloc_info[i].name,
+ kmalloc_info[i].size,
+ flags);
}
/*
- * Caches that are not of the two-to-the-power-of size.
- * These have to be created immediately after the
- * earlier power of two caches
+ * "i == 2" is the "kmalloc-192" case which is the last special
+ * case for initialization and it's the point to jump to
+ * allocate the minimize size of the object. In slab allocator,
+ * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
+ * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
+ * defined, it may be larger than 2^5 and here is also the
+ * trick to skip the empty gap.
*/
- if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
- kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
-
- if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
- kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
+ if (i == 2)
+ i = (KMALLOC_SHIFT_LOW - 1);
}
/* Kmalloc array is now usable */
slab_state = UP;
- for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
- char *n;
-
- if (s) {
- n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
-
- BUG_ON(!n);
- s->name = n;
- }
- }
-
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache *s = kmalloc_caches[i];
diff --git a/mm/slob.c b/mm/slob.c
index 4765f65019c733..495df8e006ec2f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ return kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
int __kmem_cache_shutdown(struct kmem_cache *c)
{
/* No way to check for remaining objects */
diff --git a/mm/slub.c b/mm/slub.c
index 54c0876b43d554..f920dc583f5d6f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -2750,6 +2752,72 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ __kmem_cache_free_bulk(s, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ if (!kmem_cache_debug(s)) {
+ struct kmem_cache_cpu *c;
+
+ /* Drain objects in the per cpu slab */
+ local_irq_disable();
+ c = this_cpu_ptr(s->cpu_slab);
+
+ while (size) {
+ void *object = c->freelist;
+
+ if (unlikely(!object)) {
+ /*
+ * Check if there are remotely freed objects
+ * available in the page.
+ */
+ object = get_freelist(s, c->page);
+
+ if (!object) {
+ /*
+ * All objects in use. Let's check if
+ * we have other per cpu partial pages
+ * that have available objects.
+ */
+ c->page = c->partial;
+ if (!c->page) {
+ /* No per cpu objects left */
+ c->freelist = NULL;
+ break;
+ }
+
+ /* Next per cpu partial page */
+ c->partial = c->page->next;
+ c->freelist = get_freelist(s, c->page);
+ continue;
+ }
+
+ }
+
+ *p++ = object;
+ size--;
+
+ if (unlikely(flags & __GFP_ZERO))
+ memset(object, 0, s->object_size);
+
+ c->freelist = get_freepointer(s, object);
+
+ }
+ c->tid = next_tid(c->tid);
+
+ local_irq_enable();
+ }
+
+ return __kmem_cache_alloc_bulk(s, flags, size, p);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+
+
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -3700,6 +3768,7 @@ void __init kmem_cache_init(void)
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
+ setup_kmalloc_cache_index_table();
create_kmalloc_caches(0);
#ifdef CONFIG_SMP
diff --git a/mm/swap.c b/mm/swap.c
index a7251a8ed53297..ab7c338eda87a4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -44,6 +44,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -131,7 +132,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
* here, see the comment above this function.
*/
VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
if (put_page_testzero(page_head)) {
/*
* If this is the tail of a slab THP page,
@@ -797,6 +797,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -823,6 +841,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
activate_page_drain(cpu);
}
@@ -852,6 +874,26 @@ void deactivate_file_page(struct page *page)
}
}
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
void lru_add_drain(void)
{
lru_add_drain_cpu(get_cpu());
@@ -881,6 +923,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66138da1b..a2611ce55413f8 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -357,7 +357,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -371,7 +371,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
new file mode 100644
index 00000000000000..77fee9325a5727
--- /dev/null
+++ b/mm/userfaultfd.c
@@ -0,0 +1,308 @@
+/*
+ * mm/userfaultfd.c
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mmu_notifier.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ struct mem_cgroup *memcg;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ void *page_kaddr;
+ int ret;
+ struct page *page;
+
+ if (!*pagep) {
+ ret = -ENOMEM;
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
+ if (!page)
+ goto out;
+
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ ret = -EFAULT;
+ *pagep = page;
+ /* don't free the page */
+ goto out;
+ }
+ } else {
+ page = *pagep;
+ *pagep = NULL;
+ }
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceeding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ ret = -ENOMEM;
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ goto out_release;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (dst_vma->vm_flags & VM_WRITE)
+ _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_release_uncharge_unlock;
+
+ inc_mm_counter(dst_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr);
+ mem_cgroup_commit_charge(page, memcg, false);
+ lru_cache_add_active_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+
+ pte_unmap_unlock(dst_pte, ptl);
+ ret = 0;
+out:
+ return ret;
+out_release_uncharge_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+out_release:
+ page_cache_release(page);
+ goto out;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ret = -EEXIST;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, address);
+ pud = pud_alloc(mm, pgd, address);
+ if (pud)
+ /*
+ * Note that we didn't run this because the pmd was
+ * missing, the *pmd may be already established and in
+ * turn it may also be a trans_huge_pmd.
+ */
+ pmd = pmd_alloc(mm, pud, address);
+ return pmd;
+}
+
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ bool zeropage)
+{
+ struct vm_area_struct *dst_vma;
+ ssize_t err;
+ pmd_t *dst_pmd;
+ unsigned long src_addr, dst_addr;
+ long copied;
+ struct page *page;
+
+ /*
+ * Sanitize the command parameters:
+ */
+ BUG_ON(dst_start & ~PAGE_MASK);
+ BUG_ON(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ BUG_ON(src_start + len <= src_start);
+ BUG_ON(dst_start + len <= dst_start);
+
+ src_addr = src_start;
+ dst_addr = dst_start;
+ copied = 0;
+ page = NULL;
+retry:
+ down_read(&dst_mm->mmap_sem);
+
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ err = -EINVAL;
+ dst_vma = find_vma(dst_mm, dst_start);
+ if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+ goto out_unlock;
+ if (dst_start < dst_vma->vm_start ||
+ dst_start + len > dst_vma->vm_end)
+ goto out_unlock;
+
+ /*
+ * Be strict and only allow __mcopy_atomic on userfaultfd
+ * registered ranges to prevent userland errors going
+ * unnoticed. As far as the VM consistency is concerned, it
+ * would be perfectly safe to remove this check, but there's
+ * no useful usage for __mcopy_atomic ouside of userfaultfd
+ * registered ranges. This is after all why these are ioctls
+ * belonging to the userfaultfd and not syscalls.
+ */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ goto out_unlock;
+
+ /*
+ * FIXME: only allow copying on anonymous vmas, tmpfs should
+ * be added.
+ */
+ if (dst_vma->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Ensure the dst_vma has a anon_vma or this page
+ * would get a NULL anon_vma when moved in the
+ * dst_vma.
+ */
+ err = -ENOMEM;
+ if (unlikely(anon_vma_prepare(dst_vma)))
+ goto out_unlock;
+
+ while (src_addr < src_start + len) {
+ pmd_t dst_pmdval;
+
+ BUG_ON(dst_addr >= dst_start + len);
+
+ dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+ if (unlikely(!dst_pmd)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ dst_pmdval = pmd_read_atomic(dst_pmd);
+ /*
+ * If the dst_pmd is mapped as THP don't
+ * override it and just be strict.
+ */
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_none(dst_pmdval)) &&
+ unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
+ dst_addr))) {
+ err = -ENOMEM;
+ break;
+ }
+ /* If an huge pmd materialized from under us fail */
+ if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ err = -EFAULT;
+ break;
+ }
+
+ BUG_ON(pmd_none(*dst_pmd));
+ BUG_ON(pmd_trans_huge(*dst_pmd));
+
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, &page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr);
+
+ cond_resched();
+
+ if (unlikely(err == -EFAULT)) {
+ void *page_kaddr;
+
+ up_read(&dst_mm->mmap_sem);
+ BUG_ON(!page);
+
+ page_kaddr = kmap(page);
+ err = copy_from_user(page_kaddr,
+ (const void __user *) src_addr,
+ PAGE_SIZE);
+ kunmap(page);
+ if (unlikely(err)) {
+ err = -EFAULT;
+ goto out;
+ }
+ goto retry;
+ } else
+ BUG_ON(page);
+
+ if (!err) {
+ dst_addr += PAGE_SIZE;
+ src_addr += PAGE_SIZE;
+ copied += PAGE_SIZE;
+
+ if (fatal_signal_pending(current))
+ err = -EINTR;
+ }
+ if (err)
+ break;
+ }
+
+out_unlock:
+ up_read(&dst_mm->mmap_sem);
+out:
+ if (page)
+ page_cache_release(page);
+ BUG_ON(copied < 0);
+ BUG_ON(err > 0);
+ BUG_ON(!copied && !err);
+ return copied ? copied : err;
+}
+
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len)
+{
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+}
+
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len)
+{
+ return __mcopy_atomic(dst_mm, start, 0, len, true);
+}
diff --git a/mm/util.c b/mm/util.c
index 68ff8a5361e79a..c7434060039bd1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -3,6 +3,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/ctype.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/security.h>
@@ -100,6 +101,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp)
EXPORT_SYMBOL(kstrndup);
/**
+ * kstrimdup - Trim and copy a %NUL terminated string.
+ * @s: the string to trim and duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns an address, which the caller must kfree, containing
+ * a duplicate of the passed string with leading and/or trailing
+ * whitespace (as defined by isspace) removed.
+ */
+char *kstrimdup(const char *s, gfp_t gfp)
+{
+ char *buf;
+ char *begin = skip_spaces(s);
+ size_t len = strlen(begin);
+
+ while (len && isspace(begin[len - 1]))
+ len--;
+
+ buf = kmalloc_track_caller(len + 1, gfp);
+ if (!buf)
+ return NULL;
+
+ memcpy(buf, begin, len);
+ buf[len] = '\0';
+
+ return buf;
+}
+EXPORT_SYMBOL(kstrimdup);
+
+/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
@@ -355,7 +385,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -368,10 +400,10 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
}
int overcommit_ratio_handler(struct ctl_table *table, int write,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8cb16ebaf3ed08..c8d82827279a18 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -791,13 +791,17 @@ enum page_references {
};
static enum page_references page_check_references(struct page *page,
- struct scan_control *sc)
+ struct scan_control *sc,
+ bool *freeable)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
+ int pte_dirty;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
+ &vm_flags, &pte_dirty);
referenced_page = TestClearPageReferenced(page);
/*
@@ -838,6 +842,10 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_KEEP;
}
+ if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) &&
+ !PageDirty(page))
+ *freeable = true;
+
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
@@ -906,6 +914,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool freeable = false;
cond_resched();
@@ -1029,7 +1038,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (!force_reclaim)
- references = page_check_references(page, sc);
+ references = page_check_references(page, sc,
+ &freeable);
switch (references) {
case PAGEREF_ACTIVATE:
@@ -1046,22 +1056,31 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* Try to allocate it some swap space here.
*/
if (PageAnon(page) && !PageSwapCache(page)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (!add_to_swap(page, page_list))
- goto activate_locked;
- may_enter_fs = 1;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
+ if (!freeable) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (!add_to_swap(page, page_list))
+ goto activate_locked;
+ may_enter_fs = 1;
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ } else {
+ if (likely(!PageTransHuge(page)))
+ goto unmap;
+ /* try_to_unmap isn't aware of THP page */
+ if (unlikely(split_huge_page_to_list(page,
+ page_list)))
+ goto keep_locked;
+ }
}
-
+unmap:
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, ttu_flags)) {
+ if (page_mapped(page) && (mapping || freeable)) {
+ switch (try_to_unmap(page,
+ freeable ? TTU_FREE : ttu_flags)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -1069,7 +1088,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case SWAP_MLOCK:
goto cull_mlocked;
case SWAP_SUCCESS:
- ; /* try to free the page below */
+ /* try to free the page below */
+ if (!freeable)
+ break;
+ /*
+ * Freeable anon page doesn't have mapping
+ * due to skipping of swapcache so we free
+ * page in here rather than __remove_mapping.
+ */
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ if (!page_freeze_refs(page, 1))
+ goto keep_locked;
+ __ClearPageLocked(page);
+ count_vm_event(PGLAZYFREED);
+ goto free_it;
}
}
@@ -1179,7 +1211,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
@@ -1438,6 +1470,32 @@ int isolate_lru_page(struct page *page)
return ret;
}
+static int __too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc, int safe)
+{
+ unsigned long inactive, isolated;
+
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone,
+ NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state_snapshot(zone,
+ NR_ISOLATED_ANON + file);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3;
+
+ return isolated > inactive;
+}
+
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get resheduled. When there are massive number of tasks doing page
@@ -1446,33 +1504,24 @@ int isolate_lru_page(struct page *page)
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
- struct scan_control *sc)
+ struct scan_control *sc)
{
- unsigned long inactive, isolated;
-
if (current_is_kswapd())
return 0;
if (!sane_reclaim(sc))
return 0;
- if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
- } else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
- }
-
/*
- * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
- * won't get blocked by normal direct-reclaimers, forming a circular
- * deadlock.
+ * __too_many_isolated(safe=0) is fast but inaccurate, because it
+ * doesn't account for the vm_stat_diff[] counters. So if it looks
+ * like too_many_isolated() is about to return true, fall back to the
+ * slower, more accurate zone_page_state_snapshot().
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
- inactive >>= 3;
+ if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+ return __too_many_isolated(zone, file, sc, 1);
- return isolated > inactive;
+ return 0;
}
static noinline_for_stack void
@@ -1809,7 +1858,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (page_referenced(page, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ &vm_flags, NULL)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
@@ -2683,7 +2732,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!populated_zone(zone))
+ if (!populated_zone(zone) ||
+ zone_reclaimable_pages(zone) == 0)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -3633,7 +3683,7 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
* Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -3675,12 +3725,12 @@ static long zone_pagecache_reclaimable(struct zone *zone)
long delta = 0;
/*
- * If RECLAIM_SWAP is set, then all file pages are considered
+ * If RECLAIM_UNMAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
* pages like swapcache and zone_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_SWAP)
+ if (zone_reclaim_mode & RECLAIM_UNMAP)
nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
else
nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@ -3711,15 +3761,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.order = order,
.priority = ZONE_RECLAIM_PRIORITY,
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
};
cond_resched();
/*
- * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * We need to be able to allocate from the reserves for RECLAIM_UNMAP
* and we also need to be able to write out pages for RECLAIM_WRITE
- * and RECLAIM_SWAP.
+ * and RECLAIM_UNMAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f5cd974e11a0a..1fd0886a389f21 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -759,6 +759,7 @@ const char * const vmstat_text[] = {
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
diff --git a/mm/zbud.c b/mm/zbud.c
index 2ee4e45204936e..f3bf6f7627d8d1 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -97,6 +97,10 @@ struct zbud_pool {
struct list_head lru;
u64 pages_nr;
struct zbud_ops *ops;
+#ifdef CONFIG_ZPOOL
+ struct zpool *zpool;
+ struct zpool_ops *zpool_ops;
+#endif
};
/*
@@ -123,7 +127,10 @@ struct zbud_header {
static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
{
- return zpool_evict(pool, handle);
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
}
static struct zbud_ops zbud_zpool_ops = {
@@ -131,9 +138,17 @@ static struct zbud_ops zbud_zpool_ops = {
};
static void *zbud_zpool_create(char *name, gfp_t gfp,
- struct zpool_ops *zpool_ops)
+ struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
{
- return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ struct zbud_pool *pool;
+
+ pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
}
static void zbud_zpool_destroy(void *pool)
@@ -292,7 +307,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
struct zbud_pool *pool;
int i;
- pool = kmalloc(sizeof(struct zbud_pool), gfp);
+ pool = kzalloc(sizeof(struct zbud_pool), gfp);
if (!pool)
return NULL;
spin_lock_init(&pool->lock);
diff --git a/mm/zpool.c b/mm/zpool.c
index bacdab6e47de33..722a4f60e90b29 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -73,33 +73,6 @@ int zpool_unregister_driver(struct zpool_driver *driver)
}
EXPORT_SYMBOL(zpool_unregister_driver);
-/**
- * zpool_evict() - evict callback from a zpool implementation.
- * @pool: pool to evict from.
- * @handle: handle to evict.
- *
- * This can be used by zpool implementations to call the
- * user's evict zpool_ops struct evict callback.
- */
-int zpool_evict(void *pool, unsigned long handle)
-{
- struct zpool *zpool;
-
- spin_lock(&pools_lock);
- list_for_each_entry(zpool, &pools_head, list) {
- if (zpool->pool == pool) {
- spin_unlock(&pools_lock);
- if (!zpool->ops || !zpool->ops->evict)
- return -EINVAL;
- return zpool->ops->evict(zpool, handle);
- }
- }
- spin_unlock(&pools_lock);
-
- return -ENOENT;
-}
-EXPORT_SYMBOL(zpool_evict);
-
static struct zpool_driver *zpool_get_driver(char *type)
{
struct zpool_driver *driver;
@@ -147,7 +120,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
struct zpool_driver *driver;
struct zpool *zpool;
- pr_info("creating pool type %s\n", type);
+ pr_debug("creating pool type %s\n", type);
driver = zpool_get_driver(type);
@@ -170,7 +143,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(name, gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
if (!zpool->pool) {
@@ -180,7 +153,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
return NULL;
}
- pr_info("created %s pool\n", type);
+ pr_debug("created pool type %s\n", type);
spin_lock(&pools_lock);
list_add(&zpool->list, &pools_head);
@@ -202,7 +175,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
*/
void zpool_destroy_pool(struct zpool *zpool)
{
- pr_info("destroying pool type %s\n", zpool->type);
+ pr_debug("destroying pool type %s\n", zpool->type);
spin_lock(&pools_lock);
list_del(&zpool->list);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 08bd7a3d464a9c..287f09e68a3d02 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -45,10 +45,6 @@
*
*/
-#ifdef CONFIG_ZSMALLOC_DEBUG
-#define DEBUG
-#endif
-
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -312,7 +308,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
{
return zs_create_pool(name, gfp);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 4249e82ff934d9..915a4e724d562c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -75,9 +75,10 @@ static u64 zswap_duplicate_entry;
/*********************************
* tunables
**********************************/
-/* Enable/disable zswap (disabled by default, fixed at boot for now) */
-static bool zswap_enabled __read_mostly;
-module_param_named(enabled, zswap_enabled, bool, 0444);
+
+/* Enable/disable zswap (disabled by default) */
+static bool zswap_enabled;
+module_param_named(enabled, zswap_enabled, bool, 0644);
/* Compressor to be used by zswap (fixed at boot for now) */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
@@ -490,7 +491,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -501,7 +502,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
@@ -648,7 +649,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
u8 *src, *dst;
struct zswap_header *zhdr;
- if (!tree) {
+ if (!zswap_enabled || !tree) {
ret = -ENODEV;
goto reject;
}
@@ -901,9 +902,6 @@ static int __init init_zswap(void)
{
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
- if (!zswap_enabled)
- return 0;
-
pr_info("loading zswap\n");
zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 337ca851a350cc..b140c092d226ed 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)
clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
ret = atomic_dec_and_test(&task->tk_count);
if (waitqueue_active(wq))
- __wake_up_locked_key(wq, TASK_NORMAL, &k);
+ __wake_up_locked_key(wq, TASK_NORMAL, 1, &k);
spin_unlock_irqrestore(&wq->lock, flags);
return ret;
}
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 89b1df4e72ab34..69c4716d9e2745 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -9,6 +9,7 @@ use strict;
use POSIX;
use File::Basename;
use Cwd 'abs_path';
+use Term::ANSIColor qw(:constants);
my $P = $0;
my $D = dirname(abs_path($P));
@@ -24,6 +25,7 @@ my $chk_patch = 1;
my $tst_only;
my $emacs = 0;
my $terse = 0;
+my $showfile = 0;
my $file = 0;
my $check = 0;
my $check_orig = 0;
@@ -48,7 +50,8 @@ my $minimum_perl_version = 5.10.0;
my $min_conf_desc_length = 4;
my $spelling_file = "$D/spelling.txt";
my $codespell = 0;
-my $codespellfile = "/usr/local/share/codespell/dictionary.txt";
+my $codespellfile = "/usr/share/codespell/dictionary.txt";
+my $color = 1;
sub help {
my ($exitcode) = @_;
@@ -64,6 +67,7 @@ Options:
--patch treat FILE as patchfile (default)
--emacs emacs compile window format
--terse one line per report
+ --showfile emit diffed file position, not input file position
-f, --file treat FILE as regular source file
--subjective, --strict enable more subjective tests
--types TYPE(,TYPE2...) show only these comma separated message types
@@ -91,8 +95,9 @@ Options:
--ignore-perl-version override checking of perl version. expect
runtime errors.
--codespell Use the codespell dictionary for spelling/typos
- (default:/usr/local/share/codespell/dictionary.txt)
+ (default:/usr/share/codespell/dictionary.txt)
--codespellfile Use this codespell dictionary
+ --color Use colors when output is STDOUT (default: on)
-h, --help, --version display this help and exit
When FILE is - read standard input.
@@ -134,6 +139,7 @@ GetOptions(
'patch!' => \$chk_patch,
'emacs!' => \$emacs,
'terse!' => \$terse,
+ 'showfile!' => \$showfile,
'f|file!' => \$file,
'subjective!' => \$check,
'strict!' => \$check,
@@ -153,6 +159,7 @@ GetOptions(
'test-only=s' => \$tst_only,
'codespell!' => \$codespell,
'codespellfile=s' => \$codespellfile,
+ 'color!' => \$color,
'h|help' => \$help,
'version' => \$help
) or help(1);
@@ -197,11 +204,11 @@ sub hash_show_words {
my ($hashRef, $prefix) = @_;
if ($quiet == 0 && keys %$hashRef) {
- print "NOTE: $prefix message types:";
+ print "\nNOTE: $prefix message types:";
foreach my $word (sort keys %$hashRef) {
print " $word";
}
- print "\n\n";
+ print "\n";
}
}
@@ -347,15 +354,20 @@ our $UTF8 = qr{
| $NON_ASCII_UTF8
}x;
+our $typeC99Typedefs = qr{(?:__)?(?:[us]_?)?int_?(?:8|16|32|64)_t};
our $typeOtherOSTypedefs = qr{(?x:
u_(?:char|short|int|long) | # bsd
u(?:nchar|short|int|long) # sysv
)};
-
-our $typeTypedefs = qr{(?x:
+our $typeKernelTypedefs = qr{(?x:
(?:__)?(?:u|s|be|le)(?:8|16|32|64)|
atomic_t
)};
+our $typeTypedefs = qr{(?x:
+ $typeC99Typedefs\b|
+ $typeOtherOSTypedefs\b|
+ $typeKernelTypedefs\b
+)};
our $logFunctions = qr{(?x:
printk(?:_ratelimited|_once|)|
@@ -418,6 +430,7 @@ our @typeList = (
qr{${Ident}_handler_fn},
@typeListMisordered,
);
+our @typeListFile = ();
our @typeListWithAttr = (
@typeList,
qr{struct\s+$InitAttribute\s+$Ident},
@@ -427,6 +440,7 @@ our @typeListWithAttr = (
our @modifierList = (
qr{fastcall},
);
+our @modifierListFile = ();
our @mode_permission_funcs = (
["module_param", 3],
@@ -510,13 +524,12 @@ if ($codespell) {
$misspellings = join("|", sort keys %spelling_fix) if keys %spelling_fix;
sub build_types {
- my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)";
- my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)";
+ my $mods = "(?x: \n" . join("|\n ", (@modifierList, @modifierListFile)) . "\n)";
+ my $all = "(?x: \n" . join("|\n ", (@typeList, @typeListFile)) . "\n)";
my $Misordered = "(?x: \n" . join("|\n ", @typeListMisordered) . "\n)";
my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)";
$Modifier = qr{(?:$Attribute|$Sparse|$mods)};
$BasicType = qr{
- (?:$typeOtherOSTypedefs\b)|
(?:$typeTypedefs\b)|
(?:${all}\b)
}x;
@@ -524,7 +537,6 @@ sub build_types {
(?:$Modifier\s+|const\s+)*
(?:
(?:typeof|__typeof__)\s*\([^\)]*\)|
- (?:$typeOtherOSTypedefs\b)|
(?:$typeTypedefs\b)|
(?:${all}\b)
)
@@ -542,7 +554,6 @@ sub build_types {
(?:
(?:typeof|__typeof__)\s*\([^\)]*\)|
(?:$typeTypedefs\b)|
- (?:$typeOtherOSTypedefs\b)|
(?:${allWithAttr}\b)
)
(?:\s+$Modifier|\s+const)*
@@ -737,6 +748,13 @@ for my $filename (@ARGV) {
push(@rawlines, $_);
}
close($FILE);
+
+ if ($#ARGV > 0 && $quiet == 0) {
+ print '-' x length($vname) . "\n";
+ print "$vname\n";
+ print '-' x length($vname) . "\n";
+ }
+
if (!process($filename)) {
$exit = 1;
}
@@ -746,6 +764,26 @@ for my $filename (@ARGV) {
@fixed_inserted = ();
@fixed_deleted = ();
$fixlinenr = -1;
+ @modifierListFile = ();
+ @typeListFile = ();
+ build_types();
+}
+
+if (!$quiet) {
+ if ($^V lt 5.10.0) {
+ print << "EOM"
+
+NOTE: perl $^V is not modern enough to detect all possible issues.
+ An upgrade to at least perl v5.10.0 is suggested.
+EOM
+ }
+ if ($exit) {
+ print << "EOM"
+
+NOTE: If any of the errors are false positives, please report
+ them to the maintainer, see CHECKPATCH in MAINTAINERS.
+EOM
+ }
}
exit($exit);
@@ -1001,7 +1039,7 @@ sub sanitise_line {
sub get_quoted_string {
my ($line, $rawline) = @_;
- return "" if ($line !~ m/(\"[X\t]+\")/g);
+ return "" if ($line !~ m/($String)/g);
return substr($rawline, $-[0], $+[0] - $-[0]);
}
@@ -1610,13 +1648,13 @@ sub possible {
for my $modifier (split(' ', $possible)) {
if ($modifier !~ $notPermitted) {
warn "MODIFIER: $modifier ($possible) ($line)\n" if ($dbg_possible);
- push(@modifierList, $modifier);
+ push(@modifierListFile, $modifier);
}
}
} else {
warn "POSSIBLE: $possible ($line)\n" if ($dbg_possible);
- push(@typeList, $possible);
+ push(@typeListFile, $possible);
}
build_types();
} else {
@@ -1641,15 +1679,32 @@ sub report {
(defined $tst_only && $msg !~ /\Q$tst_only\E/)) {
return 0;
}
- my $line;
+ my $output = '';
+ if (-t STDOUT && $color) {
+ if ($level eq 'ERROR') {
+ $output .= RED;
+ } elsif ($level eq 'WARNING') {
+ $output .= YELLOW;
+ } else {
+ $output .= GREEN;
+ }
+ }
+ $output .= $prefix . $level . ':';
if ($show_types) {
- $line = "$prefix$level:$type: $msg\n";
- } else {
- $line = "$prefix$level: $msg\n";
+ $output .= BLUE if (-t STDOUT && $color);
+ $output .= "$type:";
+ }
+ $output .= RESET if (-t STDOUT && $color);
+ $output .= ' ' . $msg . "\n";
+
+ if ($showfile) {
+ my @lines = split("\n", $output, -1);
+ splice(@lines, 1, 1);
+ $output = join("\n", @lines);
}
- $line = (split('\n', $line))[0] . "\n" if ($terse);
+ $output = (split('\n', $output))[0] . "\n" if ($terse);
- push(our @report, $line);
+ push(our @report, $output);
return 1;
}
@@ -2073,10 +2128,6 @@ sub process {
my $hunk_line = ($realcnt != 0);
-#make up the handle for any error we report on this line
- $prefix = "$filename:$realline: " if ($emacs && $file);
- $prefix = "$filename:$linenr: " if ($emacs && !$file);
-
$here = "#$linenr: " if (!$file);
$here = "#$realline: " if ($file);
@@ -2106,6 +2157,13 @@ sub process {
$found_file = 1;
}
+#make up the handle for any error we report on this line
+ if ($showfile) {
+ $prefix = "$realfile:$realline: "
+ } elsif ($emacs) {
+ $prefix = "$filename:$linenr: ";
+ }
+
if ($found_file) {
if ($realfile =~ m@^(drivers/net/|net/)@) {
$check = 1;
@@ -2510,16 +2568,56 @@ sub process {
# check we are in a valid source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c|s|S|pl|sh|dtsi|dts)$/);
-#line length limit
- if ($line =~ /^\+/ && $prevrawline !~ /\/\*\*/ &&
- $rawline !~ /^.\s*\*\s*\@$Ident\s/ &&
- !($line =~ /^\+\s*$logFunctions\s*\(\s*(?:(KERN_\S+\s*|[^"]*))?$String\s*(?:|,|\)\s*;)\s*$/ ||
- $line =~ /^\+\s*$String\s*(?:\s*|,|\)\s*;)\s*$/ ||
- $line =~ /^\+\s*#\s*define\s+\w+\s+$String$/) &&
- $length > $max_line_length)
- {
- WARN("LONG_LINE",
- "line over $max_line_length characters\n" . $herecurr);
+# line length limit (with some exclusions)
+#
+# There are a few types of lines that may extend beyond $max_line_length:
+# logging functions like pr_info that end in a string
+# lines with a single string
+# #defines that are a single string
+#
+# There are 3 different line length message types:
+# LONG_LINE_COMMENT a comment starts before but extends beyond $max_linelength
+# LONG_LINE_STRING a string starts before but extends beyond $max_line_length
+# LONG_LINE all other lines longer than $max_line_length
+#
+# if LONG_LINE is ignored, the other 2 types are also ignored
+#
+
+ if ($length > $max_line_length) {
+ my $msg_type = "LONG_LINE";
+
+ # Check the allowed long line types first
+
+ # logging functions that end in a string that starts
+ # before $max_line_length
+ if ($line =~ /^\+\s*$logFunctions\s*\(\s*(?:(?:KERN_\S+\s*|[^"]*))?($String\s*(?:|,|\)\s*;)\s*)$/ &&
+ length(expand_tabs(substr($line, 1, length($line) - length($1) - 1))) <= $max_line_length) {
+ $msg_type = "";
+
+ # lines with only strings (w/ possible termination)
+ # #defines with only strings
+ } elsif ($line =~ /^\+\s*$String\s*(?:\s*|,|\)\s*;)\s*$/ ||
+ $line =~ /^\+\s*#\s*define\s+\w+\s+$String$/) {
+ $msg_type = "";
+
+ # Otherwise set the alternate message types
+
+ # a comment starts before $max_line_length
+ } elsif ($line =~ /($;[\s$;]*)$/ &&
+ length(expand_tabs(substr($line, 1, length($line) - length($1) - 1))) <= $max_line_length) {
+ $msg_type = "LONG_LINE_COMMENT"
+
+ # a quoted string starts before $max_line_length
+ } elsif ($sline =~ /\s*($String(?:\s*(?:\\|,\s*|\)\s*;\s*))?)$/ &&
+ length(expand_tabs(substr($line, 1, length($line) - length($1) - 1))) <= $max_line_length) {
+ $msg_type = "LONG_LINE_STRING"
+ }
+
+ if ($msg_type ne "" &&
+ (show_type("LONG_LINE") || show_type($msg_type))) {
+ WARN($msg_type,
+ "line over $max_line_length characters\n" . $herecurr);
+ }
}
# check for adding lines without a newline.
@@ -3169,12 +3267,12 @@ sub process {
}
# check for global initialisers.
- if ($line =~ /^\+(\s*$Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/) {
+ if ($line =~ /^\+$Type\s*$Ident(?:\s+$Modifier)*\s*=\s*(?:0|NULL|false)\s*;/) {
if (ERROR("GLOBAL_INITIALISERS",
"do not initialise globals to 0 or NULL\n" .
$herecurr) &&
$fix) {
- $fixed[$fixlinenr] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/;
+ $fixed[$fixlinenr] =~ s/(^.$Type\s*$Ident(?:\s+$Modifier)*)\s*=\s*(0|NULL|false)\s*;/$1;/;
}
}
# check for static initialisers.
@@ -3264,7 +3362,6 @@ sub process {
$line !~ /\btypedef\s+$Type\s*\(\s*\*?$Ident\s*\)\s*\(/ &&
$line !~ /\btypedef\s+$Type\s+$Ident\s*\(/ &&
$line !~ /\b$typeTypedefs\b/ &&
- $line !~ /\b$typeOtherOSTypedefs\b/ &&
$line !~ /\b__bitwise(?:__|)\b/) {
WARN("NEW_TYPEDEFS",
"do not add new typedefs\n" . $herecurr);
@@ -4337,8 +4434,8 @@ sub process {
}
# Flatten any obvious string concatentation.
- while ($dstat =~ s/("X*")\s*$Ident/$1/ ||
- $dstat =~ s/$Ident\s*("X*")/$1/)
+ while ($dstat =~ s/($String)\s*$Ident/$1/ ||
+ $dstat =~ s/$Ident\s*($String)/$1/)
{
}
@@ -4619,7 +4716,7 @@ sub process {
# to grep for the string. Make exceptions when the previous string ends in a
# newline (multiple lines in one string constant) or '\t', '\r', ';', or '{'
# (common in inline assembly) or is a octal \123 or hexadecimal \xaf value
- if ($line =~ /^\+\s*"[X\t]*"/ &&
+ if ($line =~ /^\+\s*$String/ &&
$prevline =~ /"\s*$/ &&
$prevrawline !~ /(?:\\(?:[ntr]|[0-7]{1,3}|x[0-9a-fA-F]{1,2})|;\s*|\{\s*)"\s*$/) {
if (WARN("SPLIT_STRING",
@@ -4665,13 +4762,13 @@ sub process {
}
# concatenated string without spaces between elements
- if ($line =~ /"X+"[A-Z_]+/ || $line =~ /[A-Z_]+"X+"/) {
+ if ($line =~ /$String[A-Z_]/ || $line =~ /[A-Za-z0-9_]$String/) {
CHK("CONCATENATED_STRING",
"Concatenated strings should use spaces between elements\n" . $herecurr);
}
# uncoalesced string fragments
- if ($line =~ /"X*"\s*"/) {
+ if ($line =~ /$String\s*"/) {
WARN("STRING_FRAGMENTS",
"Consecutive strings are generally better as a single string\n" . $herecurr);
}
@@ -4898,6 +4995,13 @@ sub process {
"memory barrier without comment\n" . $herecurr);
}
}
+# check for waitqueue_active without a comment.
+ if ($line =~ /\bwaitqueue_active\s*\(/) {
+ if (!ctx_has_comment($first_line, $linenr)) {
+ WARN("WAITQUEUE_ACTIVE",
+ "waitqueue_active without comment\n" . $herecurr);
+ }
+ }
# check of hardware specific defines
if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) {
CHK("ARCH_DEFINES",
@@ -4973,6 +5077,24 @@ sub process {
"Using weak declarations can have unintended link defects\n" . $herecurr);
}
+# check for c99 types like uint8_t used outside of uapi/
+ if ($realfile !~ m@\binclude/uapi/@ &&
+ $line =~ /\b($Declare)\s*$Ident\s*[=;,\[]/) {
+ my $type = $1;
+ if ($type =~ /\b($typeC99Typedefs)\b/) {
+ $type = $1;
+ my $kernel_type = 'u';
+ $kernel_type = 's' if ($type =~ /^_*[si]/);
+ $type =~ /(\d+)/;
+ $kernel_type .= $1;
+ if (CHK("PREFER_KERNEL_TYPES",
+ "Prefer kernel type '$kernel_type' over '$type'\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\b$type\b/$kernel_type/;
+ }
+ }
+ }
+
# check for sizeof(&)
if ($line =~ /\bsizeof\s*\(\s*\&/) {
WARN("SIZEOF_ADDRESS",
@@ -5492,11 +5614,11 @@ sub process {
exit(0);
}
- if (!$is_patch) {
+ if (!$is_patch && $file !~ /cover-letter\.patch$/) {
ERROR("NOT_UNIFIED_DIFF",
"Does not appear to be a unified-diff format patch\n");
}
- if ($is_patch && $chk_signoff && $signoff == 0) {
+ if ($is_patch && $filename ne '-' && $chk_signoff && $signoff == 0) {
ERROR("MISSING_SIGN_OFF",
"Missing Signed-off-by: line(s)\n");
}
@@ -5507,22 +5629,18 @@ sub process {
print "total: $cnt_error errors, $cnt_warn warnings, " .
(($check)? "$cnt_chk checks, " : "") .
"$cnt_lines lines checked\n";
- print "\n" if ($quiet == 0);
}
if ($quiet == 0) {
-
- if ($^V lt 5.10.0) {
- print("NOTE: perl $^V is not modern enough to detect all possible issues.\n");
- print("An upgrade to at least perl v5.10.0 is suggested.\n\n");
- }
-
# If there were whitespace errors which cleanpatch can fix
# then suggest that.
if ($rpt_cleaners) {
- print "NOTE: whitespace errors detected, you may wish to use scripts/cleanpatch or\n";
- print " scripts/cleanfile\n\n";
$rpt_cleaners = 0;
+ print << "EOM"
+
+NOTE: Whitespace errors detected.
+ You may wish to use scripts/cleanpatch or scripts/cleanfile
+EOM
}
}
@@ -5556,6 +5674,7 @@ sub process {
if (!$quiet) {
print << "EOM";
+
Wrote EXPERIMENTAL --fix correction(s) to '$newfile'
Do _NOT_ trust the results written to this file.
@@ -5563,22 +5682,17 @@ Do _NOT_ submit these changes without inspecting them for correctness.
This EXPERIMENTAL file is simply a convenience to help rewrite patches.
No warranties, expressed or implied...
-
EOM
}
}
- if ($clean == 1 && $quiet == 0) {
- print "$vname has no obvious style problems and is ready for submission.\n"
- }
- if ($clean == 0 && $quiet == 0) {
- print << "EOM";
-$vname has style problems, please review.
-
-If any of these errors are false positives, please report
-them to the maintainer, see CHECKPATCH in MAINTAINERS.
-EOM
+ if ($quiet == 0) {
+ print "\n";
+ if ($clean == 1) {
+ print "$vname has no obvious style problems and is ready for submission.\n";
+ } else {
+ print "$vname has style problems, please review.\n";
+ }
}
-
return $clean;
}
diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py
index 3c947f0c5dad13..927d0d2a314517 100644
--- a/scripts/gdb/linux/dmesg.py
+++ b/scripts/gdb/linux/dmesg.py
@@ -12,7 +12,6 @@
#
import gdb
-import string
from linux import utils
diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py
new file mode 100644
index 00000000000000..3a3775bc162bd7
--- /dev/null
+++ b/scripts/gdb/linux/lists.py
@@ -0,0 +1,92 @@
+#
+# gdb helper commands and functions for Linux kernel debugging
+#
+# list tools
+#
+# Copyright (c) Thiebaud Weksteen, 2015
+#
+# Authors:
+# Thiebaud Weksteen <thiebaud@weksteen.fr>
+#
+# This work is licensed under the terms of the GNU GPL version 2.
+#
+
+import gdb
+
+from linux import utils
+
+list_head = utils.CachedType("struct list_head")
+
+
+def list_check(head):
+ nb = 0
+ if (head.type == list_head.get_type().pointer()):
+ head = head.dereference()
+ elif (head.type != list_head.get_type()):
+ raise gdb.GdbError('argument must be of type (struct list_head [*])')
+ c = head
+ try:
+ gdb.write("Starting with: {}\n".format(c))
+ except gdb.MemoryError:
+ gdb.write('head is not accessible\n')
+ return
+ while True:
+ p = c['prev'].dereference()
+ n = c['next'].dereference()
+ try:
+ if p['next'] != c.address:
+ gdb.write('prev.next != current: '
+ 'current@{current_addr}={current} '
+ 'prev@{p_addr}={p}\n'.format(
+ current_addr=c.address,
+ current=c,
+ p_addr=p.address,
+ p=p,
+ ))
+ return
+ except gdb.MemoryError:
+ gdb.write('prev is not accessible: '
+ 'current@{current_addr}={current}\n'.format(
+ current_addr=c.address,
+ current=c
+ ))
+ return
+ try:
+ if n['prev'] != c.address:
+ gdb.write('next.prev != current: '
+ 'current@{current_addr}={current} '
+ 'next@{n_addr}={n}\n'.format(
+ current_addr=c.address,
+ current=c,
+ n_addr=n.address,
+ n=n,
+ ))
+ return
+ except gdb.MemoryError:
+ gdb.write('next is not accessible: '
+ 'current@{current_addr}={current}\n'.format(
+ current_addr=c.address,
+ current=c
+ ))
+ return
+ c = n
+ nb += 1
+ if c == head:
+ gdb.write("list is consistent: {} node(s)\n".format(nb))
+ return
+
+
+class LxListChk(gdb.Command):
+ """Verify a list consistency"""
+
+ def __init__(self):
+ super(LxListChk, self).__init__("lx-list-check", gdb.COMMAND_DATA,
+ gdb.COMPLETE_EXPRESSION)
+
+ def invoke(self, arg, from_tty):
+ argv = gdb.string_to_argv(arg)
+ if len(argv) != 1:
+ raise gdb.GdbError("lx-list-check takes one argument")
+ list_check(gdb.parse_and_eval(argv[0]))
+
+LxListChk()
diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py
index cd5bea965d4eb0..627750cb420d08 100644
--- a/scripts/gdb/linux/symbols.py
+++ b/scripts/gdb/linux/symbols.py
@@ -14,9 +14,8 @@
import gdb
import os
import re
-import string
-from linux import modules, utils
+from linux import modules
if hasattr(gdb, 'Breakpoint'):
@@ -97,7 +96,7 @@ lx-symbols command."""
return ""
attrs = sect_attrs['attrs']
section_name_to_address = {
- attrs[n]['name'].string() : attrs[n]['address']
+ attrs[n]['name'].string(): attrs[n]['address']
for n in range(int(sect_attrs['nsections']))}
args = []
for section_name in [".data", ".data..read_mostly", ".rodata", ".bss"]:
@@ -124,7 +123,7 @@ lx-symbols command."""
addr=module_addr,
sections=self._section_arguments(module))
gdb.execute(cmdline, to_string=True)
- if not module_name in self.loaded_modules:
+ if module_name not in self.loaded_modules:
self.loaded_modules.append(module_name)
else:
gdb.write("no module object found for '{0}'\n".format(module_name))
@@ -164,7 +163,7 @@ lx-symbols command."""
self.load_all_symbols()
if hasattr(gdb, 'Breakpoint'):
- if not self.breakpoint is None:
+ if self.breakpoint is not None:
self.breakpoint.delete()
self.breakpoint = None
self.breakpoint = LoadModuleBreakpoint(
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
index e2037d9bb7eb89..862a4ae24d4996 100644
--- a/scripts/gdb/linux/tasks.py
+++ b/scripts/gdb/linux/tasks.py
@@ -18,8 +18,8 @@ from linux import utils
task_type = utils.CachedType("struct task_struct")
+
def task_lists():
- global task_type
task_ptr_type = task_type.get_type().pointer()
init_task = gdb.parse_and_eval("init_task").address
t = g = init_task
@@ -38,6 +38,7 @@ def task_lists():
if t == init_task:
return
+
def get_task_by_pid(pid):
for task in task_lists():
if int(task['pid']) == pid:
@@ -65,13 +66,28 @@ return that task_struct variable which PID matches."""
LxTaskByPidFunc()
+class LxPs(gdb.Command):
+ """Dump Linux tasks."""
+
+ def __init__(self):
+ super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA)
+
+ def invoke(self, arg, from_tty):
+ for task in task_lists():
+ gdb.write("{address} {pid} {comm}\n".format(
+ address=task,
+ pid=task["pid"],
+ comm=task["comm"].string()))
+
+LxPs()
+
+
thread_info_type = utils.CachedType("struct thread_info")
ia64_task_size = None
def get_thread_info(task):
- global thread_info_type
thread_info_ptr_type = thread_info_type.get_type().pointer()
if utils.is_target_arch("ia64"):
global ia64_task_size
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index 128c306db3ee89..0893b326a28b8f 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -83,7 +83,7 @@ def get_target_endianness():
elif "big endian" in endian:
target_endianness = BIG_ENDIAN
else:
- raise gdb.GdgError("unknown endianness '{0}'".format(str(endian)))
+ raise gdb.GdbError("unknown endianness '{0}'".format(str(endian)))
return target_endianness
@@ -151,6 +151,6 @@ def get_gdbserver_type():
gdbserver_type = GDBSERVER_QEMU
elif probe_kgdb():
gdbserver_type = GDBSERVER_KGDB
- if not gdbserver_type is None and hasattr(gdb, 'events'):
+ if gdbserver_type is not None and hasattr(gdb, 'events'):
gdb.events.exited.connect(exit_handler)
return gdbserver_type
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index 48489285f1192c..ce82bf5c394321 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -28,3 +28,4 @@ else:
import linux.dmesg
import linux.tasks
import linux.cpus
+ import linux.lists
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index d7016279ec2b1b..fc169fd2a3cc30 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -186,6 +186,27 @@ if (-f $conf) {
unshift(@ARGV, @conf_args) if @conf_args;
}
+my @ignore_emails = ();
+my $ignore_file = which_conf(".get_maintainer.ignore");
+if (-f $ignore_file) {
+ open(my $ignore, '<', "$ignore_file")
+ or warn "$P: Can't find a readable .get_maintainer.ignore file $!\n";
+ while (<$ignore>) {
+ my $line = $_;
+
+ $line =~ s/\s*\n?$//;
+ $line =~ s/^\s*//;
+ $line =~ s/\s+$//;
+ $line =~ s/#.*$//;
+
+ next if ($line =~ m/^\s*$/);
+ if (rfc822_valid($line)) {
+ push(@ignore_emails, $line);
+ }
+ }
+ close($ignore);
+}
+
if (!GetOptions(
'email!' => \$email,
'git!' => \$email_git,
@@ -513,6 +534,16 @@ if ($web) {
exit($exit);
+sub ignore_email_address {
+ my ($address) = @_;
+
+ foreach my $ignore (@ignore_emails) {
+ return 1 if ($ignore eq $address);
+ }
+
+ return 0;
+}
+
sub range_is_maintained {
my ($start, $end) = @_;
@@ -1868,6 +1899,7 @@ sub vcs_assign {
my $percent = $sign_offs * 100 / $divisor;
$percent = 100 if ($percent > 100);
+ next if (ignore_email_address($line));
$count++;
last if ($sign_offs < $email_git_min_signatures ||
$count > $email_git_max_maintainers ||