aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2024-04-29 08:37:17 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2024-04-29 08:37:17 +1000
commitef486a2f3d9ef66eb3b139b86d388cd34dba8da9 (patch)
tree34fd7cd63374dbb9c9c8b913c53fd0f10da2f913
parent71c44418506bd15c951e17ada6ce5708fba14bc2 (diff)
parentcd67fe6dd7fc05638a7728b16cdef4b560cd52a4 (diff)
downloadlinux-next-history-ef486a2f3d9ef66eb3b139b86d388cd34dba8da9.tar.gz
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git
Notice: this object is not reachable from any branch.
Notice: this object is not reachable from any branch.
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt4
-rw-r--r--Documentation/arch/s390/index.rst1
-rw-r--r--Documentation/arch/s390/mm.rst111
-rw-r--r--Documentation/arch/s390/vfio-ap.rst32
-rw-r--r--arch/s390/Kconfig53
-rw-r--r--arch/s390/Makefile6
-rw-r--r--arch/s390/boot/Makefile2
-rw-r--r--arch/s390/boot/boot.h8
-rw-r--r--arch/s390/boot/decompressor.c15
-rw-r--r--arch/s390/boot/decompressor.h8
-rw-r--r--arch/s390/boot/kaslr.c2
-rw-r--r--arch/s390/boot/pgm_check_info.c4
-rw-r--r--arch/s390/boot/startup.c233
-rw-r--r--arch/s390/boot/vmem.c108
-rw-r--r--arch/s390/boot/vmlinux.lds.S30
-rw-r--r--arch/s390/include/asm/ap.h30
-rw-r--r--arch/s390/include/asm/asm-prototypes.h1
-rw-r--r--arch/s390/include/asm/chsc.h15
-rw-r--r--arch/s390/include/asm/extmem.h7
-rw-r--r--arch/s390/include/asm/gmap.h2
-rw-r--r--arch/s390/include/asm/mmu.h5
-rw-r--r--arch/s390/include/asm/mmu_context.h1
-rw-r--r--arch/s390/include/asm/nospec-branch.h20
-rw-r--r--arch/s390/include/asm/nospec-insn.h13
-rw-r--r--arch/s390/include/asm/os_info.h28
-rw-r--r--arch/s390/include/asm/page.h50
-rw-r--r--arch/s390/include/asm/pgtable.h22
-rw-r--r--arch/s390/include/asm/physmem_info.h4
-rw-r--r--arch/s390/include/asm/setup.h14
-rw-r--r--arch/s390/kernel/crash_dump.c41
-rw-r--r--arch/s390/kernel/ipl.c6
-rw-r--r--arch/s390/kernel/nospec-branch.c4
-rw-r--r--arch/s390/kernel/os_info.c25
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c2
-rw-r--r--arch/s390/kernel/perf_cpum_cf_events.c11
-rw-r--r--arch/s390/kernel/setup.c6
-rw-r--r--arch/s390/kernel/uv.c51
-rw-r--r--arch/s390/kernel/vmcore_info.c2
-rw-r--r--arch/s390/kernel/vmlinux.lds.S5
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/s390/kvm/vsie.c3
-rw-r--r--arch/s390/lib/Makefile2
-rw-r--r--arch/s390/lib/expoline.S (renamed from arch/s390/lib/expoline/expoline.S)0
-rw-r--r--arch/s390/lib/expoline/Makefile3
-rw-r--r--arch/s390/mm/gmap.c165
-rw-r--r--arch/s390/mm/vmem.c5
-rw-r--r--arch/s390/tools/relocs.c2
-rw-r--r--drivers/crypto/Kconfig18
-rw-r--r--drivers/s390/char/Makefile2
-rw-r--r--drivers/s390/cio/chp.c141
-rw-r--r--drivers/s390/cio/chp.h2
-rw-r--r--drivers/s390/cio/chsc.c122
-rw-r--r--drivers/s390/cio/chsc.h5
-rw-r--r--drivers/s390/cio/css.c14
-rw-r--r--drivers/s390/cio/css.h13
-rw-r--r--drivers/s390/cio/trace.h2
-rw-r--r--drivers/s390/crypto/Makefile2
-rw-r--r--drivers/s390/crypto/ap_bus.c238
-rw-r--r--drivers/s390/crypto/ap_bus.h22
-rw-r--r--drivers/s390/crypto/ap_queue.c4
-rw-r--r--drivers/s390/crypto/vfio_ap_ops.c224
-rw-r--r--drivers/s390/crypto/vfio_ap_private.h6
-rw-r--r--mm/userfaultfd.c35
-rw-r--r--scripts/mod/modpost.c5
-rw-r--r--tools/testing/selftests/kvm/Makefile1
-rw-r--r--tools/testing/selftests/kvm/s390x/shared_zeropage_test.c110
66 files changed, 1541 insertions, 591 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 827b51450cc28d..670951a4b1d811 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4790,7 +4790,9 @@
prot_virt= [S390] enable hosting protected virtual machines
isolated from the hypervisor (if hardware supports
- that).
+ that). If enabled, the default kernel base address
+ might be overridden even when Kernel Address Space
+ Layout Randomization is disabled.
Format: <bool>
psi= [KNL] Enable or disable pressure stall information
diff --git a/Documentation/arch/s390/index.rst b/Documentation/arch/s390/index.rst
index 73c79bf586fd60..e75a6e5d2505e6 100644
--- a/Documentation/arch/s390/index.rst
+++ b/Documentation/arch/s390/index.rst
@@ -8,6 +8,7 @@ s390 Architecture
cds
3270
driver-model
+ mm
monreader
qeth
s390dbf
diff --git a/Documentation/arch/s390/mm.rst b/Documentation/arch/s390/mm.rst
new file mode 100644
index 00000000000000..084adad5eef9ec
--- /dev/null
+++ b/Documentation/arch/s390/mm.rst
@@ -0,0 +1,111 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+Memory Management
+=================
+
+Virtual memory layout
+=====================
+
+.. note::
+
+ - Some aspects of the virtual memory layout setup are not
+ clarified (number of page levels, alignment, DMA memory).
+
+ - Unused gaps in the virtual memory layout could be present
+ or not - depending on how partucular system is configured.
+ No page tables are created for the unused gaps.
+
+ - The virtual memory regions are tracked or untracked by KASAN
+ instrumentation, as well as the KASAN shadow memory itself is
+ created only when CONFIG_KASAN configuration option is enabled.
+
+::
+
+ =============================================================================
+ | Physical | Virtual | VM area description
+ =============================================================================
+ +- 0 --------------+- 0 --------------+
+ | | S390_lowcore | Low-address memory
+ | +- 8 KB -----------+
+ | | |
+ | | |
+ | | ... unused gap | KASAN untracked
+ | | |
+ +- AMODE31_START --+- AMODE31_START --+ .amode31 rand. phys/virt start
+ |.amode31 text/data|.amode31 text/data| KASAN untracked
+ +- AMODE31_END ----+- AMODE31_END ----+ .amode31 rand. phys/virt end (<2GB)
+ | | |
+ | | |
+ +- __kaslr_offset_phys | kernel rand. phys start
+ | | |
+ | kernel text/data | |
+ | | |
+ +------------------+ | kernel phys end
+ | | |
+ | | |
+ | | |
+ | | |
+ +- ident_map_size -+ |
+ | |
+ | ... unused gap | KASAN untracked
+ | |
+ +- __identity_base + identity mapping start (>= 2GB)
+ | |
+ | identity | phys == virt - __identity_base
+ | mapping | virt == phys + __identity_base
+ | |
+ | | KASAN tracked
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ | |
+ +---- vmemmap -----+ 'struct page' array start
+ | |
+ | virtually mapped |
+ | memory map | KASAN untracked
+ | |
+ +- __abs_lowcore --+
+ | |
+ | Absolute Lowcore | KASAN untracked
+ | |
+ +- __memcpy_real_area
+ | |
+ | Real Memory Copy| KASAN untracked
+ | |
+ +- VMALLOC_START --+ vmalloc area start
+ | | KASAN untracked or
+ | vmalloc area | KASAN shallowly populated in case
+ | | CONFIG_KASAN_VMALLOC=y
+ +- MODULES_VADDR --+ modules area start
+ | | KASAN allocated per module or
+ | modules area | KASAN shallowly populated in case
+ | | CONFIG_KASAN_VMALLOC=y
+ +- __kaslr_offset -+ kernel rand. virt start
+ | | KASAN tracked
+ | kernel text/data | phys == (kvirt - __kaslr_offset) +
+ | | __kaslr_offset_phys
+ +- kernel .bss end + kernel rand. virt end
+ | |
+ | ... unused gap | KASAN untracked
+ | |
+ +------------------+ UltraVisor Secure Storage limit
+ | |
+ | ... unused gap | KASAN untracked
+ | |
+ +KASAN_SHADOW_START+ KASAN shadow memory start
+ | |
+ | KASAN shadow | KASAN untracked
+ | |
+ +------------------+ ASCE limit
diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst
index 929ee1c1c94007..ea744cbc8687e4 100644
--- a/Documentation/arch/s390/vfio-ap.rst
+++ b/Documentation/arch/s390/vfio-ap.rst
@@ -380,6 +380,36 @@ matrix device.
control_domains:
A read-only file for displaying the control domain numbers assigned to the
vfio_ap mediated device.
+ ap_config:
+ A read/write file that, when written to, allows all three of the
+ vfio_ap mediated device's ap matrix masks to be replaced in one shot.
+ Three masks are given, one for adapters, one for domains, and one for
+ control domains. If the given state cannot be set then no changes are
+ made to the vfio-ap mediated device.
+
+ The format of the data written to ap_config is as follows:
+ {amask},{dmask},{cmask}\n
+
+ \n is a newline character.
+
+ amask, dmask, and cmask are masks identifying which adapters, domains,
+ and control domains should be assigned to the mediated device.
+
+ The format of a mask is as follows:
+ 0xNN..NN
+
+ Where NN..NN is 64 hexadecimal characters representing a 256-bit value.
+ The leftmost (highest order) bit represents adapter/domain 0.
+
+ For an example set of masks that represent your mdev's current
+ configuration, simply cat ap_config.
+
+ Setting an adapter or domain number greater than the maximum allowed for
+ the system will result in an error.
+
+ This attribute is intended to be used by automation. End users would be
+ better served using the respective assign/unassign attributes for
+ adapters, domains, and control domains.
* functions:
@@ -550,7 +580,7 @@ These are the steps:
following Kconfig elements selected:
* IOMMU_SUPPORT
* S390
- * ZCRYPT
+ * AP
* VFIO
* KVM
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d9aed4c93ee671..7e7c8567044a38 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -17,6 +17,9 @@ config ARCH_HAS_ILOG2_U32
config ARCH_HAS_ILOG2_U64
def_bool n
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+
config GENERIC_HWEIGHT
def_bool y
@@ -552,7 +555,7 @@ config EXPOLINE
If unsure, say N.
config EXPOLINE_EXTERN
- def_bool n
+ def_bool y if EXPOLINE
depends on EXPOLINE
depends on CC_IS_GCC && GCC_VERSION >= 110200
depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
@@ -611,6 +614,25 @@ config RANDOMIZE_BASE
as a security feature that deters exploit attempts relying on
knowledge of the location of kernel internals.
+config KERNEL_IMAGE_BASE
+ hex "Kernel image base address"
+ range 0x100000 0x1FFFFFE0000000 if !KASAN
+ range 0x100000 0x1BFFFFE0000000 if KASAN
+ default 0x3FFE0000000 if !KASAN
+ default 0x7FFFE0000000 if KASAN
+ help
+ This is the address at which the kernel image is loaded in case
+ Kernel Address Space Layout Randomization (KASLR) is disabled.
+
+ In case the Protected virtualization guest support is enabled the
+ Ultravisor imposes a virtual address limit. If the value of this
+ option leads to the kernel image exceeding the Ultravisor limit,
+ this option is ignored and the image is loaded below the limit.
+
+ If the value of this option leads to the kernel image overlapping
+ the virtual memory where other data structures are located, this
+ option is ignored and the image is loaded above the structures.
+
endmenu
menu "Memory setup"
@@ -724,6 +746,33 @@ config EADM_SCH
To compile this driver as a module, choose M here: the
module will be called eadm_sch.
+config AP
+ def_tristate y
+ prompt "Support for Adjunct Processors (ap)"
+ help
+ This driver allows usage to Adjunct Processor (AP) devices via
+ the ap bus, cards and queues. Supported Adjunct Processors are
+ the CryptoExpress Cards (CEX).
+
+ To compile this driver as a module, choose M here: the
+ module will be called ap.
+
+ If unsure, say Y (default).
+
+config AP_DEBUG
+ def_bool n
+ prompt "Enable debug features for Adjunct Processor (ap) devices"
+ depends on AP
+ help
+ Say 'Y' here to enable some additional debug features for Adjunct
+ Processor (ap) devices.
+
+ There will be some more sysfs attributes displayed for ap queues.
+
+ Do not enable on production level kernel build.
+
+ If unsure, say N.
+
config VFIO_CCW
def_tristate n
prompt "Support for VFIO-CCW subchannels"
@@ -740,7 +789,7 @@ config VFIO_AP
prompt "VFIO support for AP devices"
depends on KVM
depends on VFIO
- depends on ZCRYPT
+ depends on AP
select VFIO_MDEV
help
This driver grants access to Adjunct Processor (AP) devices
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 2dbb2d2f22f99c..64821f54f1e025 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -88,7 +88,6 @@ endif
ifdef CONFIG_EXPOLINE
ifdef CONFIG_EXPOLINE_EXTERN
- KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
else
@@ -167,11 +166,6 @@ vdso_prepare: prepare0
vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg
vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg
-ifdef CONFIG_EXPOLINE_EXTERN
-modules_prepare: expoline_prepare
-expoline_prepare: scripts
- $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
-endif
endif
# Don't use tabs in echo arguments
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index 294f08a8811a22..bd5d4d37a96101 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -112,7 +112,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
ifndef CONFIG_PIE_BUILD
CMD_RELOCS=arch/s390/tools/relocs
-quiet_cmd_relocs = RELOCS $@
+quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(CMD_RELOCS) $< > $@
$(obj)/relocs.S: vmlinux FORCE
$(call if_changed,relocs)
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 567d60f78bbcca..85da1c6cef4f58 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -17,7 +17,6 @@ struct machine_info {
};
struct vmlinux_info {
- unsigned long default_lma;
unsigned long entry;
unsigned long image_size; /* does not include .bss */
unsigned long bss_size; /* uncompressed image .bss size */
@@ -74,10 +73,11 @@ void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
unsigned long randomize_within_range(unsigned long size, unsigned long align,
unsigned long min, unsigned long max);
-void setup_vmem(unsigned long asce_limit);
+void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit);
void __printf(1, 2) decompressor_printk(const char *fmt, ...);
void print_stacktrace(unsigned long sp);
void error(char *m);
+int get_random(unsigned long limit, unsigned long *value);
extern struct machine_info machine;
@@ -98,6 +98,10 @@ extern struct vmlinux_info _vmlinux_info;
#define vmlinux _vmlinux_info
#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
+#define __kernel_va(x) ((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset))
+#define __kernel_pa(x) ((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys)
+#define __identity_va(x) ((void *)((unsigned long)(x) + __identity_base))
+#define __identity_pa(x) ((unsigned long)(x) - __identity_base)
static inline bool intersects(unsigned long addr0, unsigned long size0,
unsigned long addr1, unsigned long size1)
diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c
index d762733a07530d..f478e8e9cbda4a 100644
--- a/arch/s390/boot/decompressor.c
+++ b/arch/s390/boot/decompressor.c
@@ -63,24 +63,13 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE;
#include "../../../../lib/decompress_unzstd.c"
#endif
-#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE)
-
unsigned long mem_safe_offset(void)
{
- /*
- * due to 4MB HEAD_SIZE for bzip2
- * 'decompress_offset + vmlinux.image_size' could be larger than
- * kernel at final position + its .bss, so take the larger of two
- */
- return max(decompress_offset + vmlinux.image_size,
- vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size);
+ return ALIGN(free_mem_end_ptr, PAGE_SIZE);
}
-void *decompress_kernel(void)
+void deploy_kernel(void *output)
{
- void *output = (void *)decompress_offset;
-
__decompress(_compressed_start, _compressed_end - _compressed_start,
NULL, NULL, output, vmlinux.image_size, NULL, error);
- return output;
}
diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h
index 92b81d2ea35d65..4f966f06bd65a3 100644
--- a/arch/s390/boot/decompressor.h
+++ b/arch/s390/boot/decompressor.h
@@ -2,11 +2,9 @@
#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H
#define BOOT_COMPRESSED_DECOMPRESSOR_H
-#ifdef CONFIG_KERNEL_UNCOMPRESSED
-static inline void *decompress_kernel(void) { return NULL; }
-#else
-void *decompress_kernel(void);
-#endif
+#ifndef CONFIG_KERNEL_UNCOMPRESSED
unsigned long mem_safe_offset(void);
+void deploy_kernel(void *output);
+#endif
#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */
diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c
index 90602101e2aee8..bd3bf5ef472df2 100644
--- a/arch/s390/boot/kaslr.c
+++ b/arch/s390/boot/kaslr.c
@@ -43,7 +43,7 @@ static int check_prng(void)
return PRNG_MODE_TDES;
}
-static int get_random(unsigned long limit, unsigned long *value)
+int get_random(unsigned long limit, unsigned long *value)
{
struct prng_parm prng = {
/* initial parameter block for tdes mode, copied from libica */
diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c
index 97244cd7a20696..ea96275b038039 100644
--- a/arch/s390/boot/pgm_check_info.c
+++ b/arch/s390/boot/pgm_check_info.c
@@ -153,8 +153,10 @@ void print_pgm_check_info(void)
decompressor_printk("Kernel command line: %s\n", early_command_line);
decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n",
S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1);
- if (kaslr_enabled())
+ if (kaslr_enabled()) {
decompressor_printk("Kernel random base: %lx\n", __kaslr_offset);
+ decompressor_printk("Kernel random base phys: %lx\n", __kaslr_offset_phys);
+ }
decompressor_printk("PSW : %016lx %016lx (%pS)\n",
S390_lowcore.psw_save_area.mask,
S390_lowcore.psw_save_area.addr,
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 6cf89314209a62..246d54499c201e 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -3,6 +3,7 @@
#include <linux/elf.h>
#include <asm/page-states.h>
#include <asm/boot_data.h>
+#include <asm/extmem.h>
#include <asm/sections.h>
#include <asm/maccess.h>
#include <asm/cpu_mf.h>
@@ -18,7 +19,7 @@
#include "boot.h"
#include "uv.h"
-unsigned long __bootdata_preserved(__kaslr_offset);
+struct vm_layout __bootdata_preserved(vm_layout);
unsigned long __bootdata_preserved(__abs_lowcore);
unsigned long __bootdata_preserved(__memcpy_real_area);
pte_t *__bootdata_preserved(memcpy_real_ptep);
@@ -29,7 +30,6 @@ unsigned long __bootdata_preserved(vmemmap_size);
unsigned long __bootdata_preserved(MODULES_VADDR);
unsigned long __bootdata_preserved(MODULES_END);
unsigned long __bootdata_preserved(max_mappable);
-unsigned long __bootdata(ident_map_size);
u64 __bootdata_preserved(stfle_fac_list[16]);
u64 __bootdata_preserved(alt_stfle_fac_list[16]);
@@ -109,9 +109,19 @@ static void setup_lpp(void)
}
#ifdef CONFIG_KERNEL_UNCOMPRESSED
-unsigned long mem_safe_offset(void)
+static unsigned long mem_safe_offset(void)
{
- return vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size;
+ return (unsigned long)_compressed_start;
+}
+
+static void deploy_kernel(void *output)
+{
+ void *uncompressed_start = (void *)_compressed_start;
+
+ if (output == uncompressed_start)
+ return;
+ memmove(output, uncompressed_start, vmlinux.image_size);
+ memset(uncompressed_start, 0, vmlinux.image_size);
}
#endif
@@ -142,7 +152,8 @@ static void copy_bootdata(void)
}
#ifdef CONFIG_PIE_BUILD
-static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
+static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
+ unsigned long offset, unsigned long phys_offset)
{
Elf64_Rela *rela_start, *rela_end, *rela;
int r_type, r_sym, rc;
@@ -153,18 +164,18 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end;
dynsym = (Elf64_Sym *) vmlinux.dynsym_start;
for (rela = rela_start; rela < rela_end; rela++) {
- loc = rela->r_offset + offset;
+ loc = rela->r_offset + phys_offset - __START_KERNEL;
val = rela->r_addend;
r_sym = ELF64_R_SYM(rela->r_info);
if (r_sym) {
if (dynsym[r_sym].st_shndx != SHN_UNDEF)
- val += dynsym[r_sym].st_value + offset;
+ val += dynsym[r_sym].st_value + offset - __START_KERNEL;
} else {
/*
- * 0 == undefined symbol table index (STN_UNDEF),
+ * 0 == undefined symbol table index (SHN_UNDEF),
* used for R_390_RELATIVE, only add KASLR offset
*/
- val += offset;
+ val += offset - __START_KERNEL;
}
r_type = ELF64_R_TYPE(rela->r_info);
rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0);
@@ -174,37 +185,19 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
}
static void kaslr_adjust_got(unsigned long offset) {}
-static void rescue_relocs(void) {}
-static void free_relocs(void) {}
#else
-static int *vmlinux_relocs_64_start;
-static int *vmlinux_relocs_64_end;
-
-static void rescue_relocs(void)
-{
- unsigned long size = __vmlinux_relocs_64_end - __vmlinux_relocs_64_start;
-
- vmlinux_relocs_64_start = (void *)physmem_alloc_top_down(RR_RELOC, size, 0);
- vmlinux_relocs_64_end = (void *)vmlinux_relocs_64_start + size;
- memmove(vmlinux_relocs_64_start, __vmlinux_relocs_64_start, size);
-}
-
-static void free_relocs(void)
-{
- physmem_free(RR_RELOC);
-}
-
-static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
+static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
+ unsigned long offset, unsigned long phys_offset)
{
int *reloc;
long loc;
/* Adjust R_390_64 relocations */
- for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) {
- loc = (long)*reloc + offset;
+ for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) {
+ loc = (long)*reloc + phys_offset;
if (loc < min_addr || loc > max_addr)
error("64-bit relocation outside of kernel!\n");
- *(u64 *)loc += offset;
+ *(u64 *)loc += offset - __START_KERNEL;
}
}
@@ -217,7 +210,7 @@ static void kaslr_adjust_got(unsigned long offset)
* reason. Adjust the GOT entries.
*/
for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++)
- *entry += offset;
+ *entry += offset - __START_KERNEL;
}
#endif
@@ -261,9 +254,26 @@ static void setup_ident_map_size(unsigned long max_physmem_end)
#endif
}
-static unsigned long setup_kernel_memory_layout(void)
+#define FIXMAP_SIZE round_up(MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore))
+
+static unsigned long get_vmem_size(unsigned long identity_size,
+ unsigned long vmemmap_size,
+ unsigned long vmalloc_size,
+ unsigned long rte_size)
+{
+ unsigned long max_mappable, vsize;
+
+ max_mappable = max(identity_size, MAX_DCSS_ADDR);
+ vsize = round_up(SZ_2G + max_mappable, rte_size) +
+ round_up(vmemmap_size, rte_size) +
+ FIXMAP_SIZE + MODULES_LEN + KASLR_LEN;
+ return size_add(vsize, vmalloc_size);
+}
+
+static unsigned long setup_kernel_memory_layout(unsigned long kernel_size)
{
unsigned long vmemmap_start;
+ unsigned long kernel_start;
unsigned long asce_limit;
unsigned long rte_size;
unsigned long pages;
@@ -275,12 +285,19 @@ static unsigned long setup_kernel_memory_layout(void)
vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);
/* choose kernel address space layout: 4 or 3 levels. */
- vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size +
- MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE;
- vsize = size_add(vsize, vmalloc_size);
- if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) {
+ BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE));
+ BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE);
+ vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE);
+ if (IS_ENABLED(CONFIG_KASAN) || __NO_KASLR_END_KERNEL > _REGION2_SIZE ||
+ (vsize > _REGION2_SIZE && kaslr_enabled())) {
asce_limit = _REGION1_SIZE;
- rte_size = _REGION2_SIZE;
+ if (__NO_KASLR_END_KERNEL > _REGION2_SIZE) {
+ rte_size = _REGION2_SIZE;
+ vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION2_SIZE);
+ } else {
+ rte_size = _REGION3_SIZE;
+ }
} else {
asce_limit = _REGION2_SIZE;
rte_size = _REGION3_SIZE;
@@ -290,38 +307,67 @@ static unsigned long setup_kernel_memory_layout(void)
* Forcing modules and vmalloc area under the ultravisor
* secure storage limit, so that any vmalloc allocation
* we do could be used to back secure guest storage.
+ *
+ * Assume the secure storage limit always exceeds _REGION2_SIZE,
+ * otherwise asce_limit and rte_size would have been adjusted.
*/
vmax = adjust_to_uv_max(asce_limit);
#ifdef CONFIG_KASAN
+ BUILD_BUG_ON(__NO_KASLR_END_KERNEL > KASAN_SHADOW_START);
/* force vmalloc and modules below kasan shadow */
vmax = min(vmax, KASAN_SHADOW_START);
#endif
- __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE);
- __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
- sizeof(struct lowcore));
- MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
+ vsize = min(vsize, vmax);
+ if (kaslr_enabled()) {
+ unsigned long kernel_end, kaslr_len, slots, pos;
+
+ kaslr_len = max(KASLR_LEN, vmax - vsize);
+ slots = DIV_ROUND_UP(kaslr_len - kernel_size, THREAD_SIZE);
+ if (get_random(slots, &pos))
+ pos = 0;
+ kernel_end = vmax - pos * THREAD_SIZE;
+ kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE);
+ } else if (vmax < __NO_KASLR_END_KERNEL || vsize > __NO_KASLR_END_KERNEL) {
+ kernel_start = round_down(vmax - kernel_size, THREAD_SIZE);
+ decompressor_printk("The kernel base address is forced to %lx\n", kernel_start);
+ } else {
+ kernel_start = __NO_KASLR_START_KERNEL;
+ }
+ __kaslr_offset = kernel_start;
+
+ MODULES_END = round_down(kernel_start, _SEGMENT_SIZE);
MODULES_VADDR = MODULES_END - MODULES_LEN;
VMALLOC_END = MODULES_VADDR;
/* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */
- vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE);
+ vsize = (VMALLOC_END - FIXMAP_SIZE) / 2;
+ vsize = round_down(vsize, _SEGMENT_SIZE);
vmalloc_size = min(vmalloc_size, vsize);
VMALLOC_START = VMALLOC_END - vmalloc_size;
+ __memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE);
+ __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
+ sizeof(struct lowcore));
+
/* split remaining virtual space between 1:1 mapping & vmemmap array */
- pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
+ pages = __abs_lowcore / (PAGE_SIZE + sizeof(struct page));
pages = SECTION_ALIGN_UP(pages);
/* keep vmemmap_start aligned to a top level region table entry */
- vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size);
- vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS);
- /* maximum mappable address as seen by arch_get_mappable_range() */
- max_mappable = vmemmap_start;
+ vmemmap_start = round_down(__abs_lowcore - pages * sizeof(struct page), rte_size);
/* make sure identity map doesn't overlay with vmemmap */
ident_map_size = min(ident_map_size, vmemmap_start);
vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page);
- /* make sure vmemmap doesn't overlay with vmalloc area */
- VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
+ /* make sure vmemmap doesn't overlay with absolute lowcore area */
+ if (vmemmap_start + vmemmap_size > __abs_lowcore) {
+ vmemmap_size = SECTION_ALIGN_DOWN(ident_map_size / PAGE_SIZE) * sizeof(struct page);
+ ident_map_size = vmemmap_size / sizeof(struct page) * PAGE_SIZE;
+ }
vmemmap = (struct page *)vmemmap_start;
+ /* maximum address for which linear mapping could be created (DCSS, memory) */
+ BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS));
+ max_mappable = max(ident_map_size, MAX_DCSS_ADDR);
+ max_mappable = min(max_mappable, vmemmap_start);
+ __identity_base = round_down(vmemmap_start - max_mappable, rte_size);
return asce_limit;
}
@@ -329,9 +375,9 @@ static unsigned long setup_kernel_memory_layout(void)
/*
* This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's.
*/
-static void clear_bss_section(unsigned long vmlinux_lma)
+static void clear_bss_section(unsigned long kernel_start)
{
- memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size);
+ memset((void *)kernel_start + vmlinux.image_size, 0, vmlinux.bss_size);
}
/*
@@ -348,9 +394,8 @@ static void setup_vmalloc_size(void)
vmalloc_size = max(size, vmalloc_size);
}
-static void kaslr_adjust_vmlinux_info(unsigned long offset)
+static void kaslr_adjust_vmlinux_info(long offset)
{
- *(unsigned long *)(&vmlinux.entry) += offset;
vmlinux.bootdata_off += offset;
vmlinux.bootdata_preserved_off += offset;
#ifdef CONFIG_PIE_BUILD
@@ -373,23 +418,30 @@ static void kaslr_adjust_vmlinux_info(unsigned long offset)
#endif
}
+static void fixup_vmlinux_info(void)
+{
+ vmlinux.entry -= __START_KERNEL;
+ kaslr_adjust_vmlinux_info(-__START_KERNEL);
+}
+
void startup_kernel(void)
{
- unsigned long max_physmem_end;
- unsigned long vmlinux_lma = 0;
+ unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size;
+ unsigned long nokaslr_offset_phys = mem_safe_offset();
unsigned long amode31_lma = 0;
+ unsigned long max_physmem_end;
unsigned long asce_limit;
unsigned long safe_addr;
- void *img;
psw_t psw;
+ fixup_vmlinux_info();
setup_lpp();
- safe_addr = mem_safe_offset();
+ safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size);
/*
- * Reserve decompressor memory together with decompression heap, buffer and
- * memory which might be occupied by uncompressed kernel at default 1Mb
- * position (if KASLR is off or failed).
+ * Reserve decompressor memory together with decompression heap,
+ * buffer and memory which might be occupied by uncompressed kernel
+ * (if KASLR is off or failed).
*/
physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size)
@@ -409,40 +461,39 @@ void startup_kernel(void)
max_physmem_end = detect_max_physmem_end();
setup_ident_map_size(max_physmem_end);
setup_vmalloc_size();
- asce_limit = setup_kernel_memory_layout();
+ asce_limit = setup_kernel_memory_layout(kernel_size);
/* got final ident_map_size, physmem allocations could be performed now */
physmem_set_usable_limit(ident_map_size);
detect_physmem_online_ranges(max_physmem_end);
save_ipl_cert_comp_list();
rescue_initrd(safe_addr, ident_map_size);
- rescue_relocs();
- if (kaslr_enabled()) {
- vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size,
- THREAD_SIZE, vmlinux.default_lma,
- ident_map_size);
- if (vmlinux_lma) {
- __kaslr_offset = vmlinux_lma - vmlinux.default_lma;
- kaslr_adjust_vmlinux_info(__kaslr_offset);
- }
- }
- vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma;
- physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size);
-
- if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) {
- img = decompress_kernel();
- memmove((void *)vmlinux_lma, img, vmlinux.image_size);
- } else if (__kaslr_offset) {
- img = (void *)vmlinux.default_lma;
- memmove((void *)vmlinux_lma, img, vmlinux.image_size);
- memset(img, 0, vmlinux.image_size);
- }
+ if (kaslr_enabled())
+ __kaslr_offset_phys = randomize_within_range(kernel_size, THREAD_SIZE, 0, ident_map_size);
+ if (!__kaslr_offset_phys)
+ __kaslr_offset_phys = nokaslr_offset_phys;
+ kaslr_adjust_vmlinux_info(__kaslr_offset_phys);
+ physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size);
+ deploy_kernel((void *)__kaslr_offset_phys);
/* vmlinux decompression is done, shrink reserved low memory */
physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end);
+
+ /*
+ * In case KASLR is enabled the randomized location of .amode31
+ * section might overlap with .vmlinux.relocs section. To avoid that
+ * the below randomize_within_range() could have been called with
+ * __vmlinux_relocs_64_end as the lower range address. However,
+ * .amode31 section is written to by the decompressed kernel - at
+ * that time the contents of .vmlinux.relocs is not needed anymore.
+ * Conversly, .vmlinux.relocs is read only by the decompressor, even
+ * before the kernel started. Therefore, in case the two sections
+ * overlap there is no risk of corrupting any data.
+ */
if (kaslr_enabled())
amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G);
- amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size;
+ if (!amode31_lma)
+ amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size;
physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size);
/*
@@ -458,23 +509,23 @@ void startup_kernel(void)
* - copy_bootdata() must follow setup_vmem() to propagate changes
* to bootdata made by setup_vmem()
*/
- clear_bss_section(vmlinux_lma);
- kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, __kaslr_offset);
+ clear_bss_section(__kaslr_offset_phys);
+ kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size,
+ __kaslr_offset, __kaslr_offset_phys);
kaslr_adjust_got(__kaslr_offset);
- free_relocs();
- setup_vmem(asce_limit);
+ setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit);
copy_bootdata();
/*
* Save KASLR offset for early dumps, before vmcore_info is set.
* Mark as uneven to distinguish from real vmcore_info pointer.
*/
- S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0;
+ S390_lowcore.vmcore_info = __kaslr_offset_phys ? __kaslr_offset_phys | 0x1UL : 0;
/*
* Jump to the decompressed kernel entry point and switch DAT mode on.
*/
- psw.addr = vmlinux.entry;
+ psw.addr = __kaslr_offset + vmlinux.entry;
psw.mask = PSW_KERNEL_BITS;
__load_psw(psw);
}
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index 09b10bb6e4d070..96d48b7112d40b 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -27,6 +27,8 @@ enum populate_mode {
POPULATE_NONE,
POPULATE_DIRECT,
POPULATE_ABS_LOWCORE,
+ POPULATE_IDENTITY,
+ POPULATE_KERNEL,
#ifdef CONFIG_KASAN
POPULATE_KASAN_MAP_SHADOW,
POPULATE_KASAN_ZERO_SHADOW,
@@ -54,7 +56,7 @@ static inline void kasan_populate(unsigned long start, unsigned long end, enum p
pgtable_populate(start, end, mode);
}
-static void kasan_populate_shadow(void)
+static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end)
{
pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY);
@@ -76,44 +78,20 @@ static void kasan_populate_shadow(void)
__arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER);
__arch_set_page_dat(kasan_early_shadow_pte, 1);
- /*
- * Current memory layout:
- * +- 0 -------------+ +- shadow start -+
- * |1:1 ident mapping| /|1/8 of ident map|
- * | | / | |
- * +-end of ident map+ / +----------------+
- * | ... gap ... | / | kasan |
- * | | / | zero page |
- * +- vmalloc area -+ / | mapping |
- * | vmalloc_size | / | (untracked) |
- * +- modules vaddr -+ / +----------------+
- * | 2Gb |/ | unmapped | allocated per module
- * +- shadow start -+ +----------------+
- * | 1/8 addr space | | zero pg mapping| (untracked)
- * +- shadow end ----+---------+- shadow end ---+
- *
- * Current memory layout (KASAN_VMALLOC):
- * +- 0 -------------+ +- shadow start -+
- * |1:1 ident mapping| /|1/8 of ident map|
- * | | / | |
- * +-end of ident map+ / +----------------+
- * | ... gap ... | / | kasan zero page| (untracked)
- * | | / | mapping |
- * +- vmalloc area -+ / +----------------+
- * | vmalloc_size | / |shallow populate|
- * +- modules vaddr -+ / +----------------+
- * | 2Gb |/ |shallow populate|
- * +- shadow start -+ +----------------+
- * | 1/8 addr space | | zero pg mapping| (untracked)
- * +- shadow end ----+---------+- shadow end ---+
- */
-
for_each_physmem_usable_range(i, &start, &end) {
- kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW);
- if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260)
- kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate((unsigned long)__identity_va(start),
+ (unsigned long)__identity_va(end),
+ POPULATE_KASAN_MAP_SHADOW);
+ if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) {
+ kasan_populate((unsigned long)__identity_va(memgap_start),
+ (unsigned long)__identity_va(start),
+ POPULATE_KASAN_ZERO_SHADOW);
+ }
memgap_start = end;
}
+ kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW);
+ kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
untracked_end = VMALLOC_START;
/* shallowly populate kasan shadow for vmalloc and modules */
@@ -122,8 +100,9 @@ static void kasan_populate_shadow(void)
untracked_end = MODULES_VADDR;
}
/* populate kasan shadow for untracked memory */
- kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW);
- kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate((unsigned long)__identity_va(ident_map_size), untracked_end,
+ POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate(kernel_end, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
}
static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
@@ -180,7 +159,9 @@ static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode)
}
#else
-static inline void kasan_populate_shadow(void) {}
+static inline void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end)
+{
+}
static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
unsigned long end, enum populate_mode mode)
@@ -263,6 +244,10 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
return addr;
case POPULATE_ABS_LOWCORE:
return __abs_lowcore_pa(addr);
+ case POPULATE_KERNEL:
+ return __kernel_pa(addr);
+ case POPULATE_IDENTITY:
+ return __identity_pa(addr);
#ifdef CONFIG_KASAN
case POPULATE_KASAN_MAP_SHADOW:
addr = physmem_alloc_top_down(RR_VMEM, size, size);
@@ -274,15 +259,22 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
}
}
-static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end)
+static bool large_allowed(enum populate_mode mode)
+{
+ return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY);
+}
+
+static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
{
- return machine.has_edat2 &&
+ return machine.has_edat2 && large_allowed(mode) &&
IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE;
}
-static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end)
+static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
{
- return machine.has_edat1 &&
+ return machine.has_edat1 && large_allowed(mode) &&
IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE;
}
@@ -322,7 +314,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e
if (pmd_none(*pmd)) {
if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode))
continue;
- if (can_large_pmd(pmd, addr, next)) {
+ if (can_large_pmd(pmd, addr, next, mode)) {
entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode));
entry = set_pmd_bit(entry, SEGMENT_KERNEL);
if (!machine.has_nx)
@@ -355,7 +347,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e
if (pud_none(*pud)) {
if (kasan_pud_populate_zero_shadow(pud, addr, next, mode))
continue;
- if (can_large_pud(pud, addr, next)) {
+ if (can_large_pud(pud, addr, next, mode)) {
entry = __pud(_pa(addr, _REGION3_SIZE, mode));
entry = set_pud_bit(entry, REGION3_KERNEL);
if (!machine.has_nx)
@@ -418,11 +410,12 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat
}
}
-void setup_vmem(unsigned long asce_limit)
+void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit)
{
unsigned long start, end;
unsigned long asce_type;
unsigned long asce_bits;
+ pgd_t *init_mm_pgd;
int i;
/*
@@ -433,6 +426,15 @@ void setup_vmem(unsigned long asce_limit)
for_each_physmem_online_range(i, &start, &end)
__arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT);
+ /*
+ * init_mm->pgd contains virtual address of swapper_pg_dir.
+ * It is unusable at this stage since DAT is yet off. Swap
+ * it for physical address of swapper_pg_dir and restore
+ * the virtual address after all page tables are created.
+ */
+ init_mm_pgd = init_mm.pgd;
+ init_mm.pgd = (pgd_t *)swapper_pg_dir;
+
if (asce_limit == _REGION1_SIZE) {
asce_type = _REGION2_ENTRY_EMPTY;
asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
@@ -453,15 +455,20 @@ void setup_vmem(unsigned long asce_limit)
* the lowcore and create the identity mapping only afterwards.
*/
pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT);
- for_each_physmem_usable_range(i, &start, &end)
- pgtable_populate(start, end, POPULATE_DIRECT);
+ for_each_physmem_usable_range(i, &start, &end) {
+ pgtable_populate((unsigned long)__identity_va(start),
+ (unsigned long)__identity_va(end),
+ POPULATE_IDENTITY);
+ }
+ pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL);
+ pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT);
pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore),
POPULATE_ABS_LOWCORE);
pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE,
POPULATE_NONE);
- memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area);
+ memcpy_real_ptep = __identity_va(__virt_to_kpte(__memcpy_real_area));
- kasan_populate_shadow();
+ kasan_populate_shadow(kernel_start, kernel_end);
S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits;
S390_lowcore.user_asce = s390_invalid_asce;
@@ -471,4 +478,5 @@ void setup_vmem(unsigned long asce_limit)
local_ctl_load(13, &S390_lowcore.kernel_asce);
init_mm.context.asce = S390_lowcore.kernel_asce.val;
+ init_mm.pgd = init_mm_pgd;
}
diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S
index 3d7ea585ab9935..d6454ec01e2275 100644
--- a/arch/s390/boot/vmlinux.lds.S
+++ b/arch/s390/boot/vmlinux.lds.S
@@ -99,8 +99,18 @@ SECTIONS
_decompressor_end = .;
+#ifndef CONFIG_PIE_BUILD
+ . = ALIGN(4);
+ .vmlinux.relocs : {
+ __vmlinux_relocs_64_start = .;
+ *(.vmlinux.relocs_64)
+ __vmlinux_relocs_64_end = .;
+ }
+#endif
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
- . = 0x100000;
+ . = ALIGN(PAGE_SIZE);
+ . += AMODE31_SIZE; /* .amode31 section */
#else
. = ALIGN(8);
#endif
@@ -110,24 +120,6 @@ SECTIONS
_compressed_end = .;
}
-#ifndef CONFIG_PIE_BUILD
- /*
- * When the kernel is built with CONFIG_KERNEL_UNCOMPRESSED, the entire
- * uncompressed vmlinux.bin is positioned in the bzImage decompressor
- * image at the default kernel LMA of 0x100000, enabling it to be
- * executed in-place. However, the size of .vmlinux.relocs could be
- * large enough to cause an overlap with the uncompressed kernel at the
- * address 0x100000. To address this issue, .vmlinux.relocs is
- * positioned after the .rodata.compressed.
- */
- . = ALIGN(4);
- .vmlinux.relocs : {
- __vmlinux_relocs_64_start = .;
- *(.vmlinux.relocs_64)
- __vmlinux_relocs_64_end = .;
- }
-#endif
-
#define SB_TRAILER_SIZE 32
/* Trailer needed for Secure Boot */
. += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
index 43ac4a64f49b02..395b02d6a13374 100644
--- a/arch/s390/include/asm/ap.h
+++ b/arch/s390/include/asm/ap.h
@@ -223,13 +223,18 @@ static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit)
* config info as returned by the ap_qci() function.
*/
struct ap_config_info {
- unsigned int apsc : 1; /* S bit */
- unsigned int apxa : 1; /* N bit */
- unsigned int qact : 1; /* C bit */
- unsigned int rc8a : 1; /* R bit */
- unsigned int : 4;
- unsigned int apsb : 1; /* B bit */
- unsigned int : 23;
+ union {
+ unsigned int flags;
+ struct {
+ unsigned int apsc : 1; /* S bit */
+ unsigned int apxa : 1; /* N bit */
+ unsigned int qact : 1; /* C bit */
+ unsigned int rc8a : 1; /* R bit */
+ unsigned int : 4;
+ unsigned int apsb : 1; /* B bit */
+ unsigned int : 23;
+ };
+ };
unsigned char na; /* max # of APs - 1 */
unsigned char nd; /* max # of Domains - 1 */
unsigned char _reserved0[10];
@@ -544,15 +549,4 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
return reg1.status;
}
-/*
- * Interface to tell the AP bus code that a configuration
- * change has happened. The bus code should at least do
- * an ap bus resource rescan.
- */
-#if IS_ENABLED(CONFIG_ZCRYPT)
-void ap_bus_cfg_chg(void);
-#else
-static inline void ap_bus_cfg_chg(void){}
-#endif
-
#endif /* _ASM_S390_AP_H_ */
diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h
index 56096ae26f296b..f662eb4b9246fb 100644
--- a/arch/s390/include/asm/asm-prototypes.h
+++ b/arch/s390/include/asm/asm-prototypes.h
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include <linux/ftrace.h>
#include <asm/fpu.h>
+#include <asm/nospec-branch.h>
#include <asm-generic/asm-prototypes.h>
__int128_t __ashlti3(__int128_t a, int b);
diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h
index bb48ea380c0d1d..bb78159d8042a9 100644
--- a/arch/s390/include/asm/chsc.h
+++ b/arch/s390/include/asm/chsc.h
@@ -11,6 +11,9 @@
#include <uapi/asm/chsc.h>
+/* struct from linux/notifier.h */
+struct notifier_block;
+
/**
* Operation codes for CHSC PNSO:
* PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport
@@ -66,4 +69,16 @@ struct chsc_pnso_area {
struct chsc_pnso_naid_l2 entries[];
} __packed __aligned(PAGE_SIZE);
+/*
+ * notifier interface - registered notifiers gets called on
+ * the following events:
+ * - ap config changed (CHSC_NOTIFY_AP_CFG)
+ */
+enum chsc_notify_type {
+ CHSC_NOTIFY_AP_CFG = 3,
+};
+
+int chsc_notifier_register(struct notifier_block *nb);
+int chsc_notifier_unregister(struct notifier_block *nb);
+
#endif /* _ASM_S390_CHSC_H */
diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h
index 568fd81bb77b51..e0a06060afddc8 100644
--- a/arch/s390/include/asm/extmem.h
+++ b/arch/s390/include/asm/extmem.h
@@ -8,6 +8,13 @@
#define _ASM_S390X_DCSS_H
#ifndef __ASSEMBLY__
+/*
+ * DCSS segment is defined as a contiguous range of pages using DEFSEG command.
+ * The range start and end is a page number with a value less than or equal to
+ * 0x7ffffff (see CP Commands and Utilities Reference).
+ */
+#define MAX_DCSS_ADDR (512UL * SZ_1G)
+
/* possible values for segment type as returned by segment_info */
#define SEG_TYPE_SW 0
#define SEG_TYPE_EW 1
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 5cc46e0dde620d..9725586f42597c 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -146,7 +146,7 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
-int gmap_mark_unmergeable(void);
+int s390_disable_cow_sharing(void);
void s390_unlist_old_asce(struct gmap *gmap);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bb1b4bef1878b2..4c2dc7abc28586 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -32,6 +32,11 @@ typedef struct {
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
+ /*
+ * The mmu context allows COW-sharing of memory pages (KSM, zeropage).
+ * Note that COW-sharing during fork() is currently always allowed.
+ */
+ unsigned int allow_cow_sharing:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 929af18b09081a..a7789a9f621868 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -35,6 +35,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
+ mm->context.allow_cow_sharing = 1;
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
index 82725cf783c70c..b9c1f3cae84267 100644
--- a/arch/s390/include/asm/nospec-branch.h
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -17,6 +17,26 @@ static inline bool nospec_uses_trampoline(void)
return __is_defined(CC_USING_EXPOLINE) && !nospec_disable;
}
+#ifdef CONFIG_EXPOLINE_EXTERN
+
+void __s390_indirect_jump_r1(void);
+void __s390_indirect_jump_r2(void);
+void __s390_indirect_jump_r3(void);
+void __s390_indirect_jump_r4(void);
+void __s390_indirect_jump_r5(void);
+void __s390_indirect_jump_r6(void);
+void __s390_indirect_jump_r7(void);
+void __s390_indirect_jump_r8(void);
+void __s390_indirect_jump_r9(void);
+void __s390_indirect_jump_r10(void);
+void __s390_indirect_jump_r11(void);
+void __s390_indirect_jump_r12(void);
+void __s390_indirect_jump_r13(void);
+void __s390_indirect_jump_r14(void);
+void __s390_indirect_jump_r15(void);
+
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_EXPOLINE_H */
diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index 7a946c42ad13b8..cb15dd25bf2196 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -16,24 +16,25 @@
*/
.macro __THUNK_PROLOG_NAME name
#ifdef CONFIG_EXPOLINE_EXTERN
- .pushsection .text,"ax",@progbits
- __ALIGN
+ SYM_CODE_START(\name)
#else
.pushsection .text.\name,"axG",@progbits,\name,comdat
-#endif
.globl \name
.hidden \name
.type \name,@function
\name:
CFI_STARTPROC
+#endif
.endm
.macro __THUNK_EPILOG_NAME name
- CFI_ENDPROC
#ifdef CONFIG_EXPOLINE_EXTERN
- .size \name, .-\name
-#endif
+ SYM_CODE_END(\name)
+ EXPORT_SYMBOL(\name)
+#else
+ CFI_ENDPROC
.popsection
+#endif
.endm
.macro __THUNK_PROLOG_BR r1
diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h
index a4d2e103f1161b..dea2b37b635ed8 100644
--- a/arch/s390/include/asm/os_info.h
+++ b/arch/s390/include/asm/os_info.h
@@ -17,11 +17,24 @@
#define OS_INFO_VMCOREINFO 0
#define OS_INFO_REIPL_BLOCK 1
#define OS_INFO_FLAGS_ENTRY 2
+#define OS_INFO_RESERVED 3
+#define OS_INFO_IDENTITY_BASE 4
+#define OS_INFO_KASLR_OFFSET 5
+#define OS_INFO_KASLR_OFF_PHYS 6
+#define OS_INFO_VMEMMAP 7
+#define OS_INFO_AMODE31_START 8
+#define OS_INFO_AMODE31_END 9
+#define OS_INFO_IMAGE_START 10
+#define OS_INFO_IMAGE_END 11
+#define OS_INFO_IMAGE_PHYS 12
#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0)
struct os_info_entry {
- u64 addr;
+ union {
+ u64 addr;
+ u64 val;
+ };
u64 size;
u32 csum;
} __packed;
@@ -33,17 +46,24 @@ struct os_info {
u16 version_minor;
u64 crashkernel_addr;
u64 crashkernel_size;
- struct os_info_entry entry[3];
- u8 reserved[4004];
+ struct os_info_entry entry[10];
+ u8 reserved[3864];
} __packed;
void os_info_init(void);
-void os_info_entry_add(int nr, void *ptr, u64 len);
+void os_info_entry_add_data(int nr, void *ptr, u64 len);
+void os_info_entry_add_val(int nr, u64 val);
void os_info_crashkernel_add(unsigned long base, unsigned long size);
u32 os_info_csum(struct os_info *os_info);
#ifdef CONFIG_CRASH_DUMP
void *os_info_old_entry(int nr, unsigned long *size);
+static inline unsigned long os_info_old_value(int nr)
+{
+ unsigned long size;
+
+ return (unsigned long)os_info_old_entry(nr, &size);
+}
#else
static inline void *os_info_old_entry(int nr, unsigned long *size)
{
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 9381879f7ecf6c..224ff9d433eadd 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -178,19 +178,52 @@ int arch_make_page_accessible(struct page *page);
#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
#endif
-#define __PAGE_OFFSET 0x0UL
-#define PAGE_OFFSET 0x0UL
+struct vm_layout {
+ unsigned long kaslr_offset;
+ unsigned long kaslr_offset_phys;
+ unsigned long identity_base;
+ unsigned long identity_size;
+};
-#define __pa_nodebug(x) ((unsigned long)(x))
+extern struct vm_layout vm_layout;
+
+#define __kaslr_offset vm_layout.kaslr_offset
+#define __kaslr_offset_phys vm_layout.kaslr_offset_phys
+#define __identity_base vm_layout.identity_base
+#define ident_map_size vm_layout.identity_size
+
+static inline unsigned long kaslr_offset(void)
+{
+ return __kaslr_offset;
+}
+
+extern int __kaslr_enabled;
+static inline int kaslr_enabled(void)
+{
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+ return __kaslr_enabled;
+ return 0;
+}
+
+#define __PAGE_OFFSET __identity_base
+#define PAGE_OFFSET __PAGE_OFFSET
#ifdef __DECOMPRESSOR
+#define __pa_nodebug(x) ((unsigned long)(x))
#define __pa(x) __pa_nodebug(x)
#define __pa32(x) __pa(x)
#define __va(x) ((void *)(unsigned long)(x))
#else /* __DECOMPRESSOR */
+static inline unsigned long __pa_nodebug(unsigned long x)
+{
+ if (x < __kaslr_offset)
+ return x - __identity_base;
+ return x - __kaslr_offset + __kaslr_offset_phys;
+}
+
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x, bool is_31bit);
@@ -206,7 +239,7 @@ static inline unsigned long __phys_addr(unsigned long x, bool is_31bit)
#define __pa(x) __phys_addr((unsigned long)(x), false)
#define __pa32(x) __phys_addr((unsigned long)(x), true)
-#define __va(x) ((void *)(unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x) + __identity_base))
#endif /* __DECOMPRESSOR */
@@ -231,7 +264,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
-#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug(kaddr)))
+#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr))))
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
@@ -240,4 +273,11 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
+#define AMODE31_SIZE (3 * PAGE_SIZE)
+
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+#define __START_KERNEL 0x100000
+#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE
+#define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE)
+
#endif /* _S390_PAGE_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2cb2a2e7b34b83..16ca0993b7573e 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -107,6 +107,12 @@ static inline int is_module_addr(void *addr)
return 1;
}
+#ifdef CONFIG_RANDOMIZE_BASE
+#define KASLR_LEN (1UL << 31)
+#else
+#define KASLR_LEN 0UL
+#endif
+
/*
* A 64 bit pagetable entry of S390 has following format:
* | PFRA |0IPC| OS |
@@ -566,10 +572,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
}
/*
- * In the case that a guest uses storage keys
- * faults should no longer be backed by zero pages
+ * As soon as the guest uses storage keys or enables PV, we deduplicate all
+ * mapped shared zeropages and prevent new shared zeropages from getting
+ * mapped.
*/
-#define mm_forbids_zeropage mm_has_pgste
+#define mm_forbids_zeropage mm_forbids_zeropage
+static inline int mm_forbids_zeropage(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+ if (!mm->context.allow_cow_sharing)
+ return 1;
+#endif
+ return 0;
+}
+
static inline int mm_uses_skeys(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
index e747b067f8dbbf..f45cfc8bc233ff 100644
--- a/arch/s390/include/asm/physmem_info.h
+++ b/arch/s390/include/asm/physmem_info.h
@@ -22,7 +22,6 @@ enum reserved_range_type {
RR_DECOMPRESSOR,
RR_INITRD,
RR_VMLINUX,
- RR_RELOC,
RR_AMODE31,
RR_IPLREPORT,
RR_CERT_COMP_LIST,
@@ -170,4 +169,7 @@ static inline unsigned long get_physmem_reserved(enum reserved_range_type type,
return *size;
}
+#define AMODE31_START (physmem_info.reserved[RR_AMODE31].start)
+#define AMODE31_END (physmem_info.reserved[RR_AMODE31].end)
+
#endif
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 03bcaa8effb222..32f70873e2b7d5 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -127,20 +127,6 @@ extern void (*_machine_restart)(char *command);
extern void (*_machine_halt)(void);
extern void (*_machine_power_off)(void);
-extern unsigned long __kaslr_offset;
-static inline unsigned long kaslr_offset(void)
-{
- return __kaslr_offset;
-}
-
-extern int __kaslr_enabled;
-static inline int kaslr_enabled(void)
-{
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
- return __kaslr_enabled;
- return 0;
-}
-
struct oldmem_data {
unsigned long start;
unsigned long size;
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d09ebb6f52625e..9863ebe75019a6 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -465,7 +465,11 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt)
ehdr->e_phoff = sizeof(Elf64_Ehdr);
ehdr->e_ehsize = sizeof(Elf64_Ehdr);
ehdr->e_phentsize = sizeof(Elf64_Phdr);
- ehdr->e_phnum = mem_chunk_cnt + 1;
+ /*
+ * Number of memory chunk PT_LOAD program headers plus one kernel
+ * image PT_LOAD program header plus one PT_NOTE program header.
+ */
+ ehdr->e_phnum = mem_chunk_cnt + 1 + 1;
return ehdr + 1;
}
@@ -501,15 +505,16 @@ static int get_mem_chunk_cnt(void)
*/
static void loads_init(Elf64_Phdr *phdr)
{
+ unsigned long old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE);
phys_addr_t start, end;
u64 idx;
for_each_physmem_range(idx, &oldmem_type, &start, &end) {
- phdr->p_filesz = end - start;
phdr->p_type = PT_LOAD;
+ phdr->p_vaddr = old_identity_base + start;
phdr->p_offset = start;
- phdr->p_vaddr = (unsigned long)__va(start);
phdr->p_paddr = start;
+ phdr->p_filesz = end - start;
phdr->p_memsz = end - start;
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_align = PAGE_SIZE;
@@ -518,6 +523,25 @@ static void loads_init(Elf64_Phdr *phdr)
}
/*
+ * Prepare PT_LOAD type program header for kernel image region
+ */
+static void text_init(Elf64_Phdr *phdr)
+{
+ unsigned long start_phys = os_info_old_value(OS_INFO_IMAGE_PHYS);
+ unsigned long start = os_info_old_value(OS_INFO_IMAGE_START);
+ unsigned long end = os_info_old_value(OS_INFO_IMAGE_END);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_vaddr = start;
+ phdr->p_filesz = end - start;
+ phdr->p_memsz = end - start;
+ phdr->p_offset = start_phys;
+ phdr->p_paddr = start_phys;
+ phdr->p_flags = PF_R | PF_W | PF_X;
+ phdr->p_align = PAGE_SIZE;
+}
+
+/*
* Initialize notes (new kernel)
*/
static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
@@ -557,6 +581,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
size += nt_vmcoreinfo_size();
/* nt_final */
size += sizeof(Elf64_Nhdr);
+ /* PT_LOAD type program header for kernel text region */
+ size += sizeof(Elf64_Phdr);
/* PT_LOADS */
size += mem_chunk_cnt * sizeof(Elf64_Phdr);
@@ -568,7 +594,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
*/
int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
{
- Elf64_Phdr *phdr_notes, *phdr_loads;
+ Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text;
size_t alloc_size;
int mem_chunk_cnt;
void *ptr, *hdr;
@@ -606,14 +632,19 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
/* Init program headers */
phdr_notes = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
+ phdr_text = ptr;
+ ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
phdr_loads = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt);
/* Init notes */
hdr_off = PTR_DIFF(ptr, hdr);
ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
+ /* Init kernel text program header */
+ text_init(phdr_text);
/* Init loads */
- hdr_off = PTR_DIFF(ptr, hdr);
loads_init(phdr_loads);
+ /* Finalize program headers */
+ hdr_off = PTR_DIFF(ptr, hdr);
*addr = (unsigned long long) hdr;
*size = (unsigned long long) hdr_off;
BUG_ON(elfcorehdr_size > alloc_size);
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 5d6d381aa0be4a..f3609a0d453d59 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -1210,8 +1210,8 @@ static struct attribute_group reipl_nss_attr_group = {
void set_os_info_reipl_block(void)
{
- os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual,
- reipl_block_actual->hdr.len);
+ os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual,
+ reipl_block_actual->hdr.len);
}
/* reipl type */
@@ -1941,7 +1941,7 @@ static void dump_reipl_run(struct shutdown_trigger *trigger)
reipl_type == IPL_TYPE_NSS ||
reipl_type == IPL_TYPE_UNKNOWN)
os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR;
- os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
+ os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0);
abs_lc = get_abs_lowcore();
abs_lc->ipib = __pa(reipl_block_actual);
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index d1b16d83e49ade..9b8c24ebb00858 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -114,10 +114,10 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
type = BRASL_EXPOLINE; /* brasl instruction */
else
continue;
- thunk = instr + (*(int *)(instr + 2)) * 2;
+ thunk = instr + (long)(*(int *)(instr + 2)) * 2;
if (thunk[0] == 0xc6 && thunk[1] == 0x00)
/* exrl %r0,<target-br> */
- br = thunk + (*(int *)(thunk + 2)) * 2;
+ br = thunk + (long)(*(int *)(thunk + 2)) * 2;
else
continue;
if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index a801e6bd534178..3f4dc045a894bd 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -15,6 +15,7 @@
#include <asm/checksum.h>
#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
+#include <asm/physmem_info.h>
#include <asm/maccess.h>
#include <asm/asm-offsets.h>
@@ -43,9 +44,9 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size)
}
/*
- * Add OS info entry and update checksum
+ * Add OS info data entry and update checksum
*/
-void os_info_entry_add(int nr, void *ptr, u64 size)
+void os_info_entry_add_data(int nr, void *ptr, u64 size)
{
os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
@@ -54,6 +55,17 @@ void os_info_entry_add(int nr, void *ptr, u64 size)
}
/*
+ * Add OS info value entry and update checksum
+ */
+void os_info_entry_add_val(int nr, u64 value)
+{
+ os_info.entry[nr].val = value;
+ os_info.entry[nr].size = 0;
+ os_info.entry[nr].csum = 0;
+ os_info.csum = os_info_csum(&os_info);
+}
+
+/*
* Initialize OS info structure and set lowcore pointer
*/
void __init os_info_init(void)
@@ -63,6 +75,15 @@ void __init os_info_init(void)
os_info.version_major = OS_INFO_VERSION_MAJOR;
os_info.version_minor = OS_INFO_VERSION_MINOR;
os_info.magic = OS_INFO_MAGIC;
+ os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base);
+ os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset());
+ os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys);
+ os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap);
+ os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START);
+ os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END);
+ os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext);
+ os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end);
+ os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext));
os_info.csum = os_info_csum(&os_info);
abs_lc = get_abs_lowcore();
abs_lc->os_info = __pa(&os_info);
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 41ed6e0f0a2a92..1434642e9cba0d 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -428,7 +428,7 @@ static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset)
case CPUMF_CTR_SET_CRYPTO:
if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5)
ctrset_size = 16;
- else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
+ else if (cpumf_ctr_info.csvn >= 6)
ctrset_size = 20;
break;
case CPUMF_CTR_SET_EXT:
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 0d64aafd158f22..e4a6bfc910808a 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -855,16 +855,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
}
/* Determine version specific crypto set */
- switch (ci.csvn) {
- case 1 ... 5:
+ csvn = none;
+ if (ci.csvn >= 1 && ci.csvn <= 5)
csvn = cpumcf_svn_12345_pmu_event_attr;
- break;
- case 6 ... 7:
+ else if (ci.csvn >= 6)
csvn = cpumcf_svn_67_pmu_event_attr;
- break;
- default:
- csvn = none;
- }
/* Determine model-specific counter set(s) */
get_cpu_id(&cpu_id);
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 24ed33f044ec39..cbd5290939df1c 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -146,10 +146,10 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
unsigned long __bootdata_preserved(max_mappable);
-unsigned long __bootdata(ident_map_size);
struct physmem_info __bootdata(physmem_info);
-unsigned long __bootdata_preserved(__kaslr_offset);
+struct vm_layout __bootdata_preserved(vm_layout);
+EXPORT_SYMBOL_GPL(vm_layout);
int __bootdata_preserved(__kaslr_enabled);
unsigned int __bootdata_preserved(zlib_dfltcc_support);
EXPORT_SYMBOL(zlib_dfltcc_support);
@@ -765,7 +765,7 @@ static void __init relocate_amode31_section(void)
unsigned long amode31_size = __eamode31 - __samode31;
long amode31_offset, *ptr;
- amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31;
+ amode31_offset = AMODE31_START - (unsigned long)__samode31;
pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
/* Move original AMODE31 section to the new one */
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index fc07bc39e69839..265fea37e0308c 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -21,6 +21,7 @@
/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
int __bootdata_preserved(prot_virt_guest);
+EXPORT_SYMBOL(prot_virt_guest);
#endif
/*
@@ -181,36 +182,36 @@ int uv_convert_owned_from_secure(unsigned long paddr)
}
/*
- * Calculate the expected ref_count for a page that would otherwise have no
+ * Calculate the expected ref_count for a folio that would otherwise have no
* further pins. This was cribbed from similar functions in other places in
* the kernel, but with some slight modifications. We know that a secure
- * page can not be a huge page for example.
+ * folio can not be a large folio, for example.
*/
-static int expected_page_refs(struct page *page)
+static int expected_folio_refs(struct folio *folio)
{
int res;
- res = page_mapcount(page);
- if (PageSwapCache(page)) {
+ res = folio_mapcount(folio);
+ if (folio_test_swapcache(folio)) {
res++;
- } else if (page_mapping(page)) {
+ } else if (folio_mapping(folio)) {
res++;
- if (page_has_private(page))
+ if (folio->private)
res++;
}
return res;
}
-static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
+static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
{
int expected, cc = 0;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
return -EAGAIN;
- expected = expected_page_refs(page);
- if (!page_ref_freeze(page, expected))
+ expected = expected_folio_refs(folio);
+ if (!folio_ref_freeze(folio, expected))
return -EBUSY;
- set_bit(PG_arch_1, &page->flags);
+ set_bit(PG_arch_1, &folio->flags);
/*
* If the UVC does not succeed or fail immediately, we don't want to
* loop for long, or we might get stall notifications.
@@ -220,9 +221,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
* -EAGAIN and we let the callers deal with it.
*/
cc = __uv_call(0, (u64)uvcb);
- page_ref_unfreeze(page, expected);
+ folio_ref_unfreeze(folio, expected);
/*
- * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
+ * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
* If busy or partially completed, return -EAGAIN.
*/
if (cc == UVC_CC_OK)
@@ -277,7 +278,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
bool local_drain = false;
spinlock_t *ptelock;
unsigned long uaddr;
- struct page *page;
+ struct folio *folio;
pte_t *ptep;
int rc;
@@ -306,15 +307,19 @@ again:
if (!ptep)
goto out;
if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
- page = pte_page(*ptep);
+ folio = page_folio(pte_page(*ptep));
+ rc = -EINVAL;
+ if (folio_test_large(folio))
+ goto unlock;
rc = -EAGAIN;
- if (trylock_page(page)) {
+ if (folio_trylock(folio)) {
if (should_export_before_import(uvcb, gmap->mm))
- uv_convert_from_secure(page_to_phys(page));
- rc = make_page_secure(page, uvcb);
- unlock_page(page);
+ uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
+ rc = make_folio_secure(folio, uvcb);
+ folio_unlock(folio);
}
}
+unlock:
pte_unmap_unlock(ptep, ptelock);
out:
mmap_read_unlock(gmap->mm);
@@ -324,10 +329,10 @@ out:
* If we are here because the UVC returned busy or partial
* completion, this is just a useless check, but it is safe.
*/
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
} else if (rc == -EBUSY) {
/*
- * If we have tried a local drain and the page refcount
+ * If we have tried a local drain and the folio refcount
* still does not match our expected safe value, try with a
* system wide drain. This is needed if the pagevecs holding
* the page are on a different CPU.
@@ -338,7 +343,7 @@ out:
return -EAGAIN;
}
/*
- * We are here if the page refcount does not match the
+ * We are here if the folio refcount does not match the
* expected safe value. The main culprits are usually
* pagevecs. With lru_add_drain() we drain the pagevecs
* on the local CPU so that hopefully the refcount will
diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c
index d296dfc22191ca..23f7d7619a992e 100644
--- a/arch/s390/kernel/vmcore_info.c
+++ b/arch/s390/kernel/vmcore_info.c
@@ -14,7 +14,9 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31);
vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31);
+ vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys);
abs_lc = get_abs_lowcore();
abs_lc->vmcore_info = paddr_vmcoreinfo_note();
put_abs_lowcore(abs_lc);
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 48de296e8905c1..023fade1da3bab 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -39,7 +39,7 @@ PHDRS {
SECTIONS
{
- . = 0x100000;
+ . = __START_KERNEL;
.text : {
_stext = .; /* Start of text section */
_text = .; /* Text and read-only data */
@@ -183,7 +183,7 @@ SECTIONS
.amode31.data : {
*(.amode31.data)
}
- . = ALIGN(PAGE_SIZE);
+ . = _samode31 + AMODE31_SIZE;
_eamode31 = .;
/* early.c uses stsi, which requires page aligned data. */
@@ -230,7 +230,6 @@ SECTIONS
* it should match struct vmlinux_info
*/
.vmlinux.info 0 (INFO) : {
- QUAD(_stext) /* default_lma */
QUAD(startup_continue) /* entry */
QUAD(__bss_start - _stext) /* image_size */
QUAD(__bss_stop - __bss_start) /* bss_size */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 5147b943a864a6..db3392f0be2128 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2631,9 +2631,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (r)
break;
- mmap_write_lock(current->mm);
- r = gmap_mark_unmergeable();
- mmap_write_unlock(current->mm);
+ r = s390_disable_cow_sharing();
if (r)
break;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b2c9f010f0fefd..d8527a046cf759 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -12,6 +12,7 @@
#include <linux/list.h>
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
+#include <linux/io.h>
#include <asm/gmap.h>
#include <asm/mmu_context.h>
@@ -1005,7 +1006,7 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (read_guest_real(vcpu, fac, &vsie_page->fac,
stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
- scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ scb_s->fac = (u32)virt_to_phys(&vsie_page->fac);
}
return 0;
}
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 90eac15ea62aaf..f43f897d3fc027 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -23,4 +23,4 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/
+obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o
diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline.S
index 92ed8409a7a449..92ed8409a7a449 100644
--- a/arch/s390/lib/expoline/expoline.S
+++ b/arch/s390/lib/expoline.S
diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile
deleted file mode 100644
index 854631d9cb03a8..00000000000000
--- a/arch/s390/lib/expoline/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-y += expoline.o
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 12d22a7fa32fd2..474a25ca5c48ff 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2550,41 +2550,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Remove all empty zero pages from the mapping for lazy refaulting
- * - This must be called after mm->context.has_pgste is set, to avoid
- * future creation of zero pages
- * - This must be called after THP was disabled.
- *
- * mm contracts with s390, that even if mm were to remove a page table,
- * racing with the loop below and so causing pte_offset_map_lock() to fail,
- * it will never insert a page table containing empty zero pages once
- * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set.
- */
-static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
- unsigned long addr;
-
- for (addr = start; addr != end; addr += PAGE_SIZE) {
- pte_t *ptep;
- spinlock_t *ptl;
-
- ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!ptep)
- break;
- if (is_zero_pfn(pte_pfn(*ptep)))
- ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
- pte_unmap_unlock(ptep, ptl);
- }
- return 0;
-}
-
-static const struct mm_walk_ops zap_zero_walk_ops = {
- .pmd_entry = __zap_zero_pages,
- .walk_lock = PGWALK_WRLOCK,
-};
-
-/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
@@ -2601,22 +2566,142 @@ int s390_enable_sie(void)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
-int gmap_mark_unmergeable(void)
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ unsigned long *found_addr = walk->private;
+
+ /* Return 1 of the page is a zeropage. */
+ if (is_zero_pfn(pte_pfn(*pte))) {
+ /*
+ * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+ * right thing and likely don't care: FAULT_FLAG_UNSHARE
+ * currently only works in COW mappings, which is also where
+ * mm_forbids_zeropage() is checked.
+ */
+ if (!is_cow_mapping(walk->vma->vm_flags))
+ return -EFAULT;
+
+ *found_addr = addr;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+ .pte_entry = find_zeropage_pte_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
+/*
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390, that even if mm were to remove a page table,
+ * and racing with walk_page_range_vma() calling pte_offset_map_lock()
+ * would fail, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) i.e.
+ * mm->context.allow_cow_sharing is set to 0.
+ */
+static int __s390_unshare_zeropages(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+ unsigned long addr;
+ vm_fault_t fault;
+ int rc;
+
+ for_each_vma(vmi, vma) {
+ /*
+ * We could only look at COW mappings, but it's more future
+ * proof to catch unexpected zeropages in other mappings and
+ * fail.
+ */
+ if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+ continue;
+ addr = vma->vm_start;
+
+retry:
+ rc = walk_page_range_vma(vma, addr, vma->vm_end,
+ &find_zeropage_ops, &addr);
+ if (rc < 0)
+ return rc;
+ else if (!rc)
+ continue;
+
+ /* addr was updated by find_zeropage_pte_entry() */
+ fault = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ if (fault & VM_FAULT_OOM)
+ return -ENOMEM;
+ /*
+ * See break_ksm(): even after handle_mm_fault() returned 0, we
+ * must start the lookup from the current address, because
+ * handle_mm_fault() may back out if there's any difficulty.
+ *
+ * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+ * maybe they could trigger in the future on concurrent
+ * truncation. In that case, the shared zeropage would be gone
+ * and we can simply retry and make progress.
+ */
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+static int __s390_disable_cow_sharing(struct mm_struct *mm)
{
+ int rc;
+
+ if (!mm->context.allow_cow_sharing)
+ return 0;
+
+ mm->context.allow_cow_sharing = 0;
+
+ /* Replace all shared zeropages by anonymous pages. */
+ rc = __s390_unshare_zeropages(mm);
/*
* Make sure to disable KSM (if enabled for the whole process or
* individual VMAs). Note that nothing currently hinders user space
* from re-enabling it.
*/
- return ksm_disable(current->mm);
+ if (!rc)
+ rc = ksm_disable(mm);
+ if (rc)
+ mm->context.allow_cow_sharing = 1;
+ return rc;
+}
+
+/*
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
+ *
+ * Not that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int s390_disable_cow_sharing(void)
+{
+ int rc;
+
+ mmap_write_lock(current->mm);
+ rc = __s390_disable_cow_sharing(current->mm);
+ mmap_write_unlock(current->mm);
+ return rc;
}
-EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
/*
* Enable storage key handling from now on and initialize the storage
@@ -2685,7 +2770,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
- rc = gmap_mark_unmergeable();
+ rc = __s390_disable_cow_sharing(mm);
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 85cddf904cb209..41c714e212927b 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -13,7 +13,9 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
+#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
+#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
@@ -21,6 +23,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
+#include <asm/physmem_info.h>
static DEFINE_MUTEX(vmem_mutex);
@@ -436,7 +439,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
return -EINVAL;
/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
- if (WARN_ON_ONCE(end > VMALLOC_START))
+ if (WARN_ON_ONCE(end > __abs_lowcore))
return -EINVAL;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c
index 30a732c808f35e..a74dbd5c9896a7 100644
--- a/arch/s390/tools/relocs.c
+++ b/arch/s390/tools/relocs.c
@@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel)
case R_390_GOTOFF64:
break;
case R_390_64:
- add_reloc(&relocs64, offset);
+ add_reloc(&relocs64, offset - ehdr.e_entry);
break;
default:
die("Unsupported relocation type: %d\n", r_type);
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 3d02702456a507..bfe18ebc2a981b 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -67,6 +67,7 @@ config CRYPTO_DEV_GEODE
config ZCRYPT
tristate "Support for s390 cryptographic adapters"
depends on S390
+ depends on AP
select HW_RANDOM
help
Select this option if you want to enable support for
@@ -74,23 +75,6 @@ config ZCRYPT
to 8 in Coprocessor (CEXxC), EP11 Coprocessor (CEXxP)
or Accelerator (CEXxA) mode.
-config ZCRYPT_DEBUG
- bool "Enable debug features for s390 cryptographic adapters"
- default n
- depends on DEBUG_KERNEL
- depends on ZCRYPT
- help
- Say 'Y' here to enable some additional debug features on the
- s390 cryptographic adapters driver.
-
- There will be some more sysfs attributes displayed for ap cards
- and queues and some flags on crypto requests are interpreted as
- debugging messages to force error injection.
-
- Do not enable on production level kernel build.
-
- If unsure, say N.
-
config PKEY
tristate "Kernel API for protected key handling"
depends on S390
diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile
index b0f6b32016362e..81d6744e1861fd 100644
--- a/drivers/s390/char/Makefile
+++ b/drivers/s390/char/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_SCLP_VT220_TTY) += sclp_vt220.o
obj-$(CONFIG_PCI) += sclp_pci.o
-obj-$(subst m,y,$(CONFIG_ZCRYPT)) += sclp_ap.o
+obj-$(subst m,y,$(CONFIG_AP)) += sclp_ap.o
obj-$(CONFIG_VMLOGRDR) += vmlogrdr.o
obj-$(CONFIG_VMCP) += vmcp.o
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index 675d7ed82356a9..a07bbecba61cd4 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -127,10 +127,9 @@ static int s390_vary_chpid(struct chp_id chpid, int on)
/*
* Channel measurement related functions
*/
-static ssize_t chp_measurement_chars_read(struct file *filp,
- struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
+static ssize_t measurement_chars_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
{
struct channel_path *chp;
struct device *device;
@@ -143,87 +142,79 @@ static ssize_t chp_measurement_chars_read(struct file *filp,
return memory_read_from_buffer(buf, count, &off, &chp->cmg_chars,
sizeof(chp->cmg_chars));
}
+static BIN_ATTR_ADMIN_RO(measurement_chars, sizeof(struct cmg_chars));
-static const struct bin_attribute chp_measurement_chars_attr = {
- .attr = {
- .name = "measurement_chars",
- .mode = S_IRUSR,
- },
- .size = sizeof(struct cmg_chars),
- .read = chp_measurement_chars_read,
-};
-
-static void chp_measurement_copy_block(struct cmg_entry *buf,
- struct channel_subsystem *css,
- struct chp_id chpid)
-{
- void *area;
- struct cmg_entry *entry, reference_buf;
- int idx;
-
- if (chpid.id < 128) {
- area = css->cub_addr1;
- idx = chpid.id;
- } else {
- area = css->cub_addr2;
- idx = chpid.id - 128;
- }
- entry = area + (idx * sizeof(struct cmg_entry));
- do {
- memcpy(buf, entry, sizeof(*entry));
- memcpy(&reference_buf, entry, sizeof(*entry));
- } while (reference_buf.values[0] != buf->values[0]);
-}
-
-static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
+static ssize_t chp_measurement_copy_block(void *buf, loff_t off, size_t count,
+ struct kobject *kobj, bool extended)
{
struct channel_path *chp;
struct channel_subsystem *css;
struct device *device;
unsigned int size;
+ void *area, *entry;
+ int id, idx;
device = kobj_to_dev(kobj);
chp = to_channelpath(device);
css = to_css(chp->dev.parent);
+ id = chp->chpid.id;
- size = sizeof(struct cmg_entry);
+ if (extended) {
+ /* Check if extended measurement data is available. */
+ if (!chp->extended)
+ return 0;
+
+ size = sizeof(struct cmg_ext_entry);
+ area = css->ecub[id / CSS_ECUES_PER_PAGE];
+ idx = id % CSS_ECUES_PER_PAGE;
+ } else {
+ size = sizeof(struct cmg_entry);
+ area = css->cub[id / CSS_CUES_PER_PAGE];
+ idx = id % CSS_CUES_PER_PAGE;
+ }
+ entry = area + (idx * size);
/* Only allow single reads. */
if (off || count < size)
return 0;
- chp_measurement_copy_block((struct cmg_entry *)buf, css, chp->chpid);
- count = size;
- return count;
+
+ memcpy(buf, entry, size);
+
+ return size;
}
-static const struct bin_attribute chp_measurement_attr = {
- .attr = {
- .name = "measurement",
- .mode = S_IRUSR,
- },
- .size = sizeof(struct cmg_entry),
- .read = chp_measurement_read,
+static ssize_t measurement_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ return chp_measurement_copy_block(buf, off, count, kobj, false);
+}
+static BIN_ATTR_ADMIN_RO(measurement, sizeof(struct cmg_entry));
+
+static ssize_t ext_measurement_read(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ return chp_measurement_copy_block(buf, off, count, kobj, true);
+}
+static BIN_ATTR_ADMIN_RO(ext_measurement, sizeof(struct cmg_ext_entry));
+
+static struct bin_attribute *measurement_attrs[] = {
+ &bin_attr_measurement_chars,
+ &bin_attr_measurement,
+ &bin_attr_ext_measurement,
+ NULL,
};
+BIN_ATTRIBUTE_GROUPS(measurement);
void chp_remove_cmg_attr(struct channel_path *chp)
{
- device_remove_bin_file(&chp->dev, &chp_measurement_chars_attr);
- device_remove_bin_file(&chp->dev, &chp_measurement_attr);
+ device_remove_groups(&chp->dev, measurement_groups);
}
int chp_add_cmg_attr(struct channel_path *chp)
{
- int ret;
-
- ret = device_create_bin_file(&chp->dev, &chp_measurement_chars_attr);
- if (ret)
- return ret;
- ret = device_create_bin_file(&chp->dev, &chp_measurement_attr);
- if (ret)
- device_remove_bin_file(&chp->dev, &chp_measurement_chars_attr);
- return ret;
+ return device_add_groups(&chp->dev, measurement_groups);
}
/*
@@ -401,6 +392,35 @@ static ssize_t chp_esc_show(struct device *dev,
}
static DEVICE_ATTR(esc, 0444, chp_esc_show, NULL);
+static char apply_max_suffix(unsigned long *value, unsigned long base)
+{
+ static char suffixes[] = { 0, 'K', 'M', 'G', 'T' };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(suffixes) - 1; i++) {
+ if (*value < base || *value % base != 0)
+ break;
+ *value /= base;
+ }
+
+ return suffixes[i];
+}
+
+static ssize_t speed_bps_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct channel_path *chp = to_channelpath(dev);
+ unsigned long speed = chp->speed;
+ char suffix;
+
+ suffix = apply_max_suffix(&speed, 1000);
+
+ return suffix ? sysfs_emit(buf, "%lu%c\n", speed, suffix) :
+ sysfs_emit(buf, "%lu\n", speed);
+}
+
+static DEVICE_ATTR_RO(speed_bps);
+
static ssize_t util_string_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t off, size_t count)
@@ -432,6 +452,7 @@ static struct attribute *chp_attrs[] = {
&dev_attr_chid.attr,
&dev_attr_chid_external.attr,
&dev_attr_esc.attr,
+ &dev_attr_speed_bps.attr,
NULL,
};
static struct attribute_group chp_attr_group = {
diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h
index 7ee9eba0abcbfa..a15324a43aa3ad 100644
--- a/drivers/s390/cio/chp.h
+++ b/drivers/s390/cio/chp.h
@@ -51,6 +51,8 @@ struct channel_path {
/* Channel-measurement related stuff: */
int cmg;
int shared;
+ int extended;
+ unsigned long speed;
struct cmg_chars cmg_chars;
};
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index 44ea76f9e1decf..dcc1e1c34ca2e1 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -24,7 +24,6 @@
#include <asm/crw.h>
#include <asm/isc.h>
#include <asm/ebcdic.h>
-#include <asm/ap.h>
#include "css.h"
#include "cio.h"
@@ -40,6 +39,20 @@ static DEFINE_SPINLOCK(chsc_page_lock);
#define SEI_VF_FLA 0xc0 /* VF flag for Full Link Address */
#define SEI_RS_CHPID 0x4 /* 4 in RS field indicates CHPID */
+static BLOCKING_NOTIFIER_HEAD(chsc_notifiers);
+
+int chsc_notifier_register(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&chsc_notifiers, nb);
+}
+EXPORT_SYMBOL(chsc_notifier_register);
+
+int chsc_notifier_unregister(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&chsc_notifiers, nb);
+}
+EXPORT_SYMBOL(chsc_notifier_unregister);
+
/**
* chsc_error_from_response() - convert a chsc response to an error
* @response: chsc response code
@@ -581,7 +594,8 @@ static void chsc_process_sei_ap_cfg_chg(struct chsc_sei_nt0_area *sei_area)
if (sei_area->rs != 5)
return;
- ap_bus_cfg_chg();
+ blocking_notifier_call_chain(&chsc_notifiers,
+ CHSC_NOTIFY_AP_CFG, NULL);
}
static void chsc_process_sei_fces_event(struct chsc_sei_nt0_area *sei_area)
@@ -857,22 +871,22 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable)
struct {
struct chsc_header request;
u32 operation_code : 2;
- u32 : 30;
+ u32 : 1;
+ u32 e : 1;
+ u32 : 28;
u32 key : 4;
u32 : 28;
- u32 zeroes1;
- dma32_t cub_addr1;
- u32 zeroes2;
- dma32_t cub_addr2;
- u32 reserved[13];
+ dma64_t cub[CSS_NUM_CUB_PAGES];
+ dma64_t ecub[CSS_NUM_ECUB_PAGES];
+ u32 reserved[5];
struct chsc_header response;
u32 status : 8;
u32 : 4;
u32 fmt : 4;
u32 : 16;
- } *secm_area;
+ } __packed *secm_area;
unsigned long flags;
- int ret, ccode;
+ int ret, ccode, i;
spin_lock_irqsave(&chsc_page_lock, flags);
memset(chsc_page, 0, PAGE_SIZE);
@@ -881,8 +895,12 @@ int __chsc_do_secm(struct channel_subsystem *css, int enable)
secm_area->request.code = 0x0016;
secm_area->key = PAGE_DEFAULT_KEY >> 4;
- secm_area->cub_addr1 = virt_to_dma32(css->cub_addr1);
- secm_area->cub_addr2 = virt_to_dma32(css->cub_addr2);
+ secm_area->e = 1;
+
+ for (i = 0; i < CSS_NUM_CUB_PAGES; i++)
+ secm_area->cub[i] = (__force dma64_t)virt_to_dma32(css->cub[i]);
+ for (i = 0; i < CSS_NUM_ECUB_PAGES; i++)
+ secm_area->ecub[i] = virt_to_dma64(css->ecub[i]);
secm_area->operation_code = enable ? 0 : 1;
@@ -908,19 +926,47 @@ out:
return ret;
}
+static int cub_alloc(struct channel_subsystem *css)
+{
+ int i;
+
+ for (i = 0; i < CSS_NUM_CUB_PAGES; i++) {
+ css->cub[i] = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ if (!css->cub[i])
+ return -ENOMEM;
+ }
+ for (i = 0; i < CSS_NUM_ECUB_PAGES; i++) {
+ css->ecub[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!css->ecub[i])
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void cub_free(struct channel_subsystem *css)
+{
+ int i;
+
+ for (i = 0; i < CSS_NUM_CUB_PAGES; i++) {
+ free_page((unsigned long)css->cub[i]);
+ css->cub[i] = NULL;
+ }
+ for (i = 0; i < CSS_NUM_ECUB_PAGES; i++) {
+ free_page((unsigned long)css->ecub[i]);
+ css->ecub[i] = NULL;
+ }
+}
+
int
chsc_secm(struct channel_subsystem *css, int enable)
{
int ret;
if (enable && !css->cm_enabled) {
- css->cub_addr1 = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
- css->cub_addr2 = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
- if (!css->cub_addr1 || !css->cub_addr2) {
- free_page((unsigned long)css->cub_addr1);
- free_page((unsigned long)css->cub_addr2);
- return -ENOMEM;
- }
+ ret = cub_alloc(css);
+ if (ret)
+ goto out;
}
ret = __chsc_do_secm(css, enable);
if (!ret) {
@@ -934,10 +980,11 @@ chsc_secm(struct channel_subsystem *css, int enable)
} else
chsc_remove_cmg_attr(css);
}
- if (!css->cm_enabled) {
- free_page((unsigned long)css->cub_addr1);
- free_page((unsigned long)css->cub_addr2);
- }
+
+out:
+ if (!css->cm_enabled)
+ cub_free(css);
+
return ret;
}
@@ -1019,6 +1066,18 @@ chsc_initialize_cmg_chars(struct channel_path *chp, u8 cmcv,
}
}
+static unsigned long scmc_get_speed(u32 s, u32 p)
+{
+ unsigned long speed = s;
+
+ if (!p)
+ p = 8;
+ while (p--)
+ speed *= 10;
+
+ return speed;
+}
+
int chsc_get_channel_measurement_chars(struct channel_path *chp)
{
unsigned long flags;
@@ -1035,18 +1094,23 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
u32 zeroes2;
u32 not_valid : 1;
u32 shared : 1;
- u32 : 22;
+ u32 extended : 1;
+ u32 : 21;
u32 chpid : 8;
u32 cmcv : 5;
- u32 : 11;
+ u32 : 7;
+ u32 cmgp : 4;
u32 cmgq : 8;
u32 cmg : 8;
- u32 zeroes3;
+ u32 : 16;
+ u32 cmgs : 16;
u32 data[NR_MEASUREMENT_CHARS];
} *scmc_area;
chp->shared = -1;
chp->cmg = -1;
+ chp->extended = 0;
+ chp->speed = 0;
if (!css_chsc_characteristics.scmc || !css_chsc_characteristics.secm)
return -EINVAL;
@@ -1076,10 +1140,8 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
chp->cmg = scmc_area->cmg;
chp->shared = scmc_area->shared;
- if (chp->cmg != 2 && chp->cmg != 3) {
- /* No cmg-dependent data. */
- goto out;
- }
+ chp->extended = scmc_area->extended;
+ chp->speed = scmc_get_speed(scmc_area->cmgs, scmc_area->cmgp);
chsc_initialize_cmg_chars(chp, scmc_area->cmcv,
(struct cmg_chars *) &scmc_area->data);
out:
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index 03602295f33503..24cd65dbc5a763 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -22,6 +22,11 @@ struct cmg_entry {
u32 values[NR_MEASUREMENT_ENTRIES];
};
+#define NR_EXT_MEASUREMENT_ENTRIES 16
+struct cmg_ext_entry {
+ u32 values[NR_EXT_MEASUREMENT_ENTRIES];
+};
+
struct channel_path_desc_fmt1 {
u8 flags;
u8 lsn;
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 1d68db1a3d4e47..781f84901256c5 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -309,7 +309,7 @@ static ssize_t type_show(struct device *dev, struct device_attribute *attr,
{
struct subchannel *sch = to_subchannel(dev);
- return sprintf(buf, "%01x\n", sch->st);
+ return sysfs_emit(buf, "%01x\n", sch->st);
}
static DEVICE_ATTR_RO(type);
@@ -319,7 +319,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
{
struct subchannel *sch = to_subchannel(dev);
- return sprintf(buf, "css:t%01X\n", sch->st);
+ return sysfs_emit(buf, "css:t%01X\n", sch->st);
}
static DEVICE_ATTR_RO(modalias);
@@ -345,7 +345,7 @@ static ssize_t driver_override_show(struct device *dev,
ssize_t len;
device_lock(dev);
- len = snprintf(buf, PAGE_SIZE, "%s\n", sch->driver_override);
+ len = sysfs_emit(buf, "%s\n", sch->driver_override);
device_unlock(dev);
return len;
}
@@ -396,8 +396,8 @@ static ssize_t pimpampom_show(struct device *dev,
struct subchannel *sch = to_subchannel(dev);
struct pmcw *pmcw = &sch->schib.pmcw;
- return sprintf(buf, "%02x %02x %02x\n",
- pmcw->pim, pmcw->pam, pmcw->pom);
+ return sysfs_emit(buf, "%02x %02x %02x\n",
+ pmcw->pim, pmcw->pam, pmcw->pom);
}
static DEVICE_ATTR_RO(pimpampom);
@@ -881,7 +881,7 @@ static ssize_t real_cssid_show(struct device *dev, struct device_attribute *a,
if (!css->id_valid)
return -EINVAL;
- return sprintf(buf, "%x\n", css->cssid);
+ return sysfs_emit(buf, "%x\n", css->cssid);
}
static DEVICE_ATTR_RO(real_cssid);
@@ -904,7 +904,7 @@ static ssize_t cm_enable_show(struct device *dev, struct device_attribute *a,
int ret;
mutex_lock(&css->mutex);
- ret = sprintf(buf, "%x\n", css->cm_enabled);
+ ret = sysfs_emit(buf, "%x\n", css->cm_enabled);
mutex_unlock(&css->mutex);
return ret;
}
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index ea555055429740..c2b175592bb736 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -35,6 +35,15 @@
#define SNID_STATE3_SINGLE_PATH 0
/*
+ * Miscellaneous constants
+ */
+
+#define CSS_NUM_CUB_PAGES 2
+#define CSS_CUES_PER_PAGE 128
+#define CSS_NUM_ECUB_PAGES 4
+#define CSS_ECUES_PER_PAGE 64
+
+/*
* Conditions used to specify which subchannels need evaluation
*/
enum css_eval_cond {
@@ -122,8 +131,8 @@ struct channel_subsystem {
struct mutex mutex;
/* channel measurement related */
int cm_enabled;
- void *cub_addr1;
- void *cub_addr2;
+ void *cub[CSS_NUM_CUB_PAGES];
+ void *ecub[CSS_NUM_ECUB_PAGES];
/* for orphaned ccw devices */
struct subchannel *pseudo_subchannel;
};
diff --git a/drivers/s390/cio/trace.h b/drivers/s390/cio/trace.h
index 86993de253451a..a4c5c6736b3107 100644
--- a/drivers/s390/cio/trace.h
+++ b/drivers/s390/cio/trace.h
@@ -50,7 +50,7 @@ DECLARE_EVENT_CLASS(s390_class_schib,
__entry->devno = schib->pmcw.dev;
__entry->schib = *schib;
__entry->pmcw_ena = schib->pmcw.ena;
- __entry->pmcw_st = schib->pmcw.ena;
+ __entry->pmcw_st = schib->pmcw.st;
__entry->pmcw_dnv = schib->pmcw.dnv;
__entry->pmcw_dev = schib->pmcw.dev;
__entry->pmcw_lpm = schib->pmcw.lpm;
diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile
index 0edacd101c124b..bd94811fd9f1f8 100644
--- a/drivers/s390/crypto/Makefile
+++ b/drivers/s390/crypto/Makefile
@@ -4,7 +4,7 @@
#
ap-objs := ap_bus.o ap_card.o ap_queue.o
-obj-$(subst m,y,$(CONFIG_ZCRYPT)) += ap.o
+obj-$(CONFIG_AP) += ap.o
# zcrypt_api.o and zcrypt_msgtype*.o depend on ap.o
zcrypt-objs := zcrypt_api.o zcrypt_card.o zcrypt_queue.o
zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index cce0bafd4c926a..c20b4509207980 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -39,13 +39,15 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <asm/uv.h>
+#include <asm/chsc.h>
#include "ap_bus.h"
#include "ap_debug.h"
-/*
- * Module parameters; note though this file itself isn't modular.
- */
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("Adjunct Processor Bus driver");
+MODULE_LICENSE("GPL");
+
int ap_domain_index = -1; /* Adjunct Processor Domain Index */
static DEFINE_SPINLOCK(ap_domain_lock);
module_param_named(domain, ap_domain_index, int, 0440);
@@ -90,8 +92,9 @@ static atomic64_t ap_bindings_complete_count = ATOMIC64_INIT(0);
/* completion for APQN bindings complete */
static DECLARE_COMPLETION(ap_apqn_bindings_complete);
-static struct ap_config_info *ap_qci_info;
-static struct ap_config_info *ap_qci_info_old;
+static struct ap_config_info qci[2];
+static struct ap_config_info *const ap_qci_info = &qci[0];
+static struct ap_config_info *const ap_qci_info_old = &qci[1];
/*
* AP bus related debug feature things.
@@ -203,9 +206,7 @@ static int ap_apft_available(void)
*/
static inline int ap_qact_available(void)
{
- if (ap_qci_info)
- return ap_qci_info->qact;
- return 0;
+ return ap_qci_info->qact;
}
/*
@@ -215,9 +216,7 @@ static inline int ap_qact_available(void)
*/
int ap_sb_available(void)
{
- if (ap_qci_info)
- return ap_qci_info->apsb;
- return 0;
+ return ap_qci_info->apsb;
}
/*
@@ -229,23 +228,6 @@ bool ap_is_se_guest(void)
}
EXPORT_SYMBOL(ap_is_se_guest);
-/*
- * ap_fetch_qci_info(): Fetch cryptographic config info
- *
- * Returns the ap configuration info fetched via PQAP(QCI).
- * On success 0 is returned, on failure a negative errno
- * is returned, e.g. if the PQAP(QCI) instruction is not
- * available, the return value will be -EOPNOTSUPP.
- */
-static inline int ap_fetch_qci_info(struct ap_config_info *info)
-{
- if (!ap_qci_available())
- return -EOPNOTSUPP;
- if (!info)
- return -EINVAL;
- return ap_qci(info);
-}
-
/**
* ap_init_qci_info(): Allocate and query qci config info.
* Does also update the static variables ap_max_domain_id
@@ -253,27 +235,12 @@ static inline int ap_fetch_qci_info(struct ap_config_info *info)
*/
static void __init ap_init_qci_info(void)
{
- if (!ap_qci_available()) {
+ if (!ap_qci_available() ||
+ ap_qci(ap_qci_info)) {
AP_DBF_INFO("%s QCI not supported\n", __func__);
return;
}
-
- ap_qci_info = kzalloc(sizeof(*ap_qci_info), GFP_KERNEL);
- if (!ap_qci_info)
- return;
- ap_qci_info_old = kzalloc(sizeof(*ap_qci_info_old), GFP_KERNEL);
- if (!ap_qci_info_old) {
- kfree(ap_qci_info);
- ap_qci_info = NULL;
- return;
- }
- if (ap_fetch_qci_info(ap_qci_info) != 0) {
- kfree(ap_qci_info);
- kfree(ap_qci_info_old);
- ap_qci_info = NULL;
- ap_qci_info_old = NULL;
- return;
- }
+ memcpy(ap_qci_info_old, ap_qci_info, sizeof(*ap_qci_info));
AP_DBF_INFO("%s successful fetched initial qci info\n", __func__);
if (ap_qci_info->apxa) {
@@ -288,8 +255,6 @@ static void __init ap_init_qci_info(void)
__func__, ap_max_domain_id);
}
}
-
- memcpy(ap_qci_info_old, ap_qci_info, sizeof(*ap_qci_info));
}
/*
@@ -312,7 +277,7 @@ static inline int ap_test_config_card_id(unsigned int id)
{
if (id > ap_max_adapter_id)
return 0;
- if (ap_qci_info)
+ if (ap_qci_info->flags)
return ap_test_config(ap_qci_info->apm, id);
return 1;
}
@@ -329,7 +294,7 @@ int ap_test_config_usage_domain(unsigned int domain)
{
if (domain > ap_max_domain_id)
return 0;
- if (ap_qci_info)
+ if (ap_qci_info->flags)
return ap_test_config(ap_qci_info->aqm, domain);
return 1;
}
@@ -1061,22 +1026,24 @@ EXPORT_SYMBOL(ap_bus_force_rescan);
/*
* A config change has happened, force an ap bus rescan.
*/
-void ap_bus_cfg_chg(void)
+static int ap_bus_cfg_chg(struct notifier_block *nb,
+ unsigned long action, void *data)
{
+ if (action != CHSC_NOTIFY_AP_CFG)
+ return NOTIFY_DONE;
+
pr_debug("%s config change, forcing bus rescan\n", __func__);
ap_bus_force_rescan();
+
+ return NOTIFY_OK;
}
-/*
- * hex2bitmap() - parse hex mask string and set bitmap.
- * Valid strings are "0x012345678" with at least one valid hex number.
- * Rest of the bitmap to the right is padded with 0. No spaces allowed
- * within the string, the leading 0x may be omitted.
- * Returns the bitmask with exactly the bits set as given by the hex
- * string (both in big endian order).
- */
-static int hex2bitmap(const char *str, unsigned long *bitmap, int bits)
+static struct notifier_block ap_bus_nb = {
+ .notifier_call = ap_bus_cfg_chg,
+};
+
+int ap_hex2bitmap(const char *str, unsigned long *bitmap, int bits)
{
int i, n, b;
@@ -1103,6 +1070,7 @@ static int hex2bitmap(const char *str, unsigned long *bitmap, int bits)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL(ap_hex2bitmap);
/*
* modify_bitmap() - parse bitmask argument and modify an existing
@@ -1168,7 +1136,7 @@ static int ap_parse_bitmap_str(const char *str, unsigned long *bitmap, int bits,
rc = modify_bitmap(str, newmap, bits);
} else {
memset(newmap, 0, size);
- rc = hex2bitmap(str, newmap, bits);
+ rc = ap_hex2bitmap(str, newmap, bits);
}
return rc;
}
@@ -1234,7 +1202,7 @@ static BUS_ATTR_RW(ap_domain);
static ssize_t ap_control_domain_mask_show(const struct bus_type *bus, char *buf)
{
- if (!ap_qci_info) /* QCI not supported */
+ if (!ap_qci_info->flags) /* QCI not supported */
return sysfs_emit(buf, "not supported\n");
return sysfs_emit(buf, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
@@ -1248,7 +1216,7 @@ static BUS_ATTR_RO(ap_control_domain_mask);
static ssize_t ap_usage_domain_mask_show(const struct bus_type *bus, char *buf)
{
- if (!ap_qci_info) /* QCI not supported */
+ if (!ap_qci_info->flags) /* QCI not supported */
return sysfs_emit(buf, "not supported\n");
return sysfs_emit(buf, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
@@ -1262,7 +1230,7 @@ static BUS_ATTR_RO(ap_usage_domain_mask);
static ssize_t ap_adapter_mask_show(const struct bus_type *bus, char *buf)
{
- if (!ap_qci_info) /* QCI not supported */
+ if (!ap_qci_info->flags) /* QCI not supported */
return sysfs_emit(buf, "not supported\n");
return sysfs_emit(buf, "0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
@@ -1595,7 +1563,7 @@ static ssize_t features_show(const struct bus_type *bus, char *buf)
{
int n = 0;
- if (!ap_qci_info) /* QCI not supported */
+ if (!ap_qci_info->flags) /* QCI not supported */
return sysfs_emit(buf, "-\n");
if (ap_qci_info->apsc)
@@ -2158,11 +2126,11 @@ static inline void ap_scan_adapter(int ap)
*/
static bool ap_get_configuration(void)
{
- if (!ap_qci_info) /* QCI not supported */
+ if (!ap_qci_info->flags) /* QCI not supported */
return false;
memcpy(ap_qci_info_old, ap_qci_info, sizeof(*ap_qci_info));
- ap_fetch_qci_info(ap_qci_info);
+ ap_qci(ap_qci_info);
return memcmp(ap_qci_info, ap_qci_info_old,
sizeof(struct ap_config_info)) != 0;
@@ -2179,7 +2147,7 @@ static bool ap_config_has_new_aps(void)
unsigned long m[BITS_TO_LONGS(AP_DEVICES)];
- if (!ap_qci_info)
+ if (!ap_qci_info->flags)
return false;
bitmap_andnot(m, (unsigned long *)ap_qci_info->apm,
@@ -2200,7 +2168,7 @@ static bool ap_config_has_new_doms(void)
{
unsigned long m[BITS_TO_LONGS(AP_DOMAINS)];
- if (!ap_qci_info)
+ if (!ap_qci_info->flags)
return false;
bitmap_andnot(m, (unsigned long *)ap_qci_info->aqm,
@@ -2310,7 +2278,82 @@ static void ap_scan_bus_wq_callback(struct work_struct *unused)
}
}
-static int __init ap_debug_init(void)
+static inline void __exit ap_async_exit(void)
+{
+ if (ap_thread_flag)
+ ap_poll_thread_stop();
+ chsc_notifier_unregister(&ap_bus_nb);
+ cancel_work(&ap_scan_bus_work);
+ hrtimer_cancel(&ap_poll_timer);
+ timer_delete(&ap_scan_bus_timer);
+}
+
+static inline int __init ap_async_init(void)
+{
+ int rc;
+
+ /* Setup the AP bus rescan timer. */
+ timer_setup(&ap_scan_bus_timer, ap_scan_bus_timer_callback, 0);
+
+ /*
+ * Setup the high resolution poll timer.
+ * If we are running under z/VM adjust polling to z/VM polling rate.
+ */
+ if (MACHINE_IS_VM)
+ poll_high_timeout = 1500000;
+ hrtimer_init(&ap_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ ap_poll_timer.function = ap_poll_timeout;
+
+ queue_work(system_long_wq, &ap_scan_bus_work);
+
+ rc = chsc_notifier_register(&ap_bus_nb);
+ if (rc)
+ goto out;
+
+ /* Start the low priority AP bus poll thread. */
+ if (!ap_thread_flag)
+ return 0;
+
+ rc = ap_poll_thread_start();
+ if (rc)
+ goto out_notifier;
+
+ return 0;
+
+out_notifier:
+ chsc_notifier_unregister(&ap_bus_nb);
+out:
+ cancel_work(&ap_scan_bus_work);
+ hrtimer_cancel(&ap_poll_timer);
+ timer_delete(&ap_scan_bus_timer);
+ return rc;
+}
+
+static inline void ap_irq_exit(void)
+{
+ if (ap_irq_flag)
+ unregister_adapter_interrupt(&ap_airq);
+}
+
+static inline int __init ap_irq_init(void)
+{
+ int rc;
+
+ if (!ap_interrupts_available() || !ap_useirq)
+ return 0;
+
+ rc = register_adapter_interrupt(&ap_airq);
+ ap_irq_flag = (rc == 0);
+
+ return rc;
+}
+
+static inline void ap_debug_exit(void)
+{
+ debug_unregister(ap_dbf_info);
+}
+
+static inline int __init ap_debug_init(void)
{
ap_dbf_info = debug_register("ap", 2, 1,
AP_DBF_MAX_SPRINTF_ARGS * sizeof(long));
@@ -2378,12 +2421,6 @@ static int __init ap_module_init(void)
ap_domain_index = -1;
}
- /* enable interrupts if available */
- if (ap_interrupts_available() && ap_useirq) {
- rc = register_adapter_interrupt(&ap_airq);
- ap_irq_flag = (rc == 0);
- }
-
/* Create /sys/bus/ap. */
rc = bus_register(&ap_bus_type);
if (rc)
@@ -2396,38 +2433,37 @@ static int __init ap_module_init(void)
goto out_bus;
ap_root_device->bus = &ap_bus_type;
- /* Setup the AP bus rescan timer. */
- timer_setup(&ap_scan_bus_timer, ap_scan_bus_timer_callback, 0);
-
- /*
- * Setup the high resolution poll timer.
- * If we are running under z/VM adjust polling to z/VM polling rate.
- */
- if (MACHINE_IS_VM)
- poll_high_timeout = 1500000;
- hrtimer_init(&ap_poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- ap_poll_timer.function = ap_poll_timeout;
-
- /* Start the low priority AP bus poll thread. */
- if (ap_thread_flag) {
- rc = ap_poll_thread_start();
- if (rc)
- goto out_work;
- }
+ /* enable interrupts if available */
+ rc = ap_irq_init();
+ if (rc)
+ goto out_device;
- queue_work(system_long_wq, &ap_scan_bus_work);
+ /* Setup asynchronous work (timers, workqueue, etc). */
+ rc = ap_async_init();
+ if (rc)
+ goto out_irq;
return 0;
-out_work:
- hrtimer_cancel(&ap_poll_timer);
+out_irq:
+ ap_irq_exit();
+out_device:
root_device_unregister(ap_root_device);
out_bus:
bus_unregister(&ap_bus_type);
out:
- if (ap_irq_flag)
- unregister_adapter_interrupt(&ap_airq);
- kfree(ap_qci_info);
+ ap_debug_exit();
return rc;
}
-device_initcall(ap_module_init);
+
+static void __exit ap_module_exit(void)
+{
+ ap_async_exit();
+ ap_irq_exit();
+ root_device_unregister(ap_root_device);
+ bus_unregister(&ap_bus_type);
+ ap_debug_exit();
+}
+
+module_init(ap_module_init);
+module_exit(ap_module_exit);
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 59c7ed49aa0248..fdbc6fdfdf576f 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -344,6 +344,28 @@ int ap_parse_mask_str(const char *str,
struct mutex *lock);
/*
+ * ap_hex2bitmap() - Convert a string containing a hexadecimal number (str)
+ * into a bitmap (bitmap) with bits set that correspond to the bits represented
+ * by the hex string. Input and output data is in big endian order.
+ *
+ * str - Input hex string of format "0x1234abcd". The leading "0x" is optional.
+ * At least one digit is required. Must be large enough to hold the number of
+ * bits represented by the bits parameter.
+ *
+ * bitmap - Pointer to a bitmap. Upon successful completion of this function,
+ * this bitmap will have bits set to match the value of str. If bitmap is longer
+ * than str, then the rightmost bits of bitmap are padded with zeros. Must be
+ * large enough to hold the number of bits represented by the bits parameter.
+ *
+ * bits - Length, in bits, of the bitmap represented by str. Must be a multiple
+ * of 8.
+ *
+ * Returns: 0 On success
+ * -EINVAL If str format is invalid or bits is not a multiple of 8.
+ */
+int ap_hex2bitmap(const char *str, unsigned long *bitmap, int bits);
+
+/*
* Interface to wait for the AP bus to have done one initial ap bus
* scan and all detected APQNs have been bound to device drivers.
* If these both conditions are not fulfilled, this function blocks
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 6e4e8d324a6def..1f647ffd6f4db9 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -708,7 +708,7 @@ static ssize_t ap_functions_show(struct device *dev,
static DEVICE_ATTR_RO(ap_functions);
-#ifdef CONFIG_ZCRYPT_DEBUG
+#ifdef CONFIG_AP_DEBUG
static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -820,7 +820,7 @@ static struct attribute *ap_queue_dev_attrs[] = {
&dev_attr_config.attr,
&dev_attr_chkstop.attr,
&dev_attr_ap_functions.attr,
-#ifdef CONFIG_ZCRYPT_DEBUG
+#ifdef CONFIG_AP_DEBUG
&dev_attr_states.attr,
&dev_attr_last_err_rc.attr,
#endif
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index fc169bc6159395..9f76f2d7b66e58 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -794,10 +794,11 @@ err_put_vdev:
static void vfio_ap_mdev_link_queue(struct ap_matrix_mdev *matrix_mdev,
struct vfio_ap_queue *q)
{
- if (q) {
- q->matrix_mdev = matrix_mdev;
- hash_add(matrix_mdev->qtable.queues, &q->mdev_qnode, q->apqn);
- }
+ if (!q || vfio_ap_mdev_get_queue(matrix_mdev, q->apqn))
+ return;
+
+ q->matrix_mdev = matrix_mdev;
+ hash_add(matrix_mdev->qtable.queues, &q->mdev_qnode, q->apqn);
}
static void vfio_ap_mdev_link_apqn(struct ap_matrix_mdev *matrix_mdev, int apqn)
@@ -1118,20 +1119,29 @@ static void vfio_ap_mdev_unlink_adapter(struct ap_matrix_mdev *matrix_mdev,
}
}
-static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev,
- unsigned long apid)
+static void vfio_ap_mdev_hot_unplug_adapters(struct ap_matrix_mdev *matrix_mdev,
+ unsigned long *apids)
{
struct vfio_ap_queue *q, *tmpq;
struct list_head qlist;
+ unsigned long apid;
+ bool apcb_update = false;
INIT_LIST_HEAD(&qlist);
- vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist);
- if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) {
- clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm);
- vfio_ap_mdev_update_guest_apcb(matrix_mdev);
+ for_each_set_bit_inv(apid, apids, AP_DEVICES) {
+ vfio_ap_mdev_unlink_adapter(matrix_mdev, apid, &qlist);
+
+ if (test_bit_inv(apid, matrix_mdev->shadow_apcb.apm)) {
+ clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm);
+ apcb_update = true;
+ }
}
+ /* Only update apcb if needed to avoid impacting guest */
+ if (apcb_update)
+ vfio_ap_mdev_update_guest_apcb(matrix_mdev);
+
vfio_ap_mdev_reset_qlist(&qlist);
list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) {
@@ -1140,6 +1150,16 @@ static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev,
}
}
+static void vfio_ap_mdev_hot_unplug_adapter(struct ap_matrix_mdev *matrix_mdev,
+ unsigned long apid)
+{
+ DECLARE_BITMAP(apids, AP_DEVICES);
+
+ bitmap_zero(apids, AP_DEVICES);
+ set_bit_inv(apid, apids);
+ vfio_ap_mdev_hot_unplug_adapters(matrix_mdev, apids);
+}
+
/**
* unassign_adapter_store - parses the APID from @buf and clears the
* corresponding bit in the mediated matrix device's APM
@@ -1300,20 +1320,29 @@ static void vfio_ap_mdev_unlink_domain(struct ap_matrix_mdev *matrix_mdev,
}
}
-static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev,
- unsigned long apqi)
+static void vfio_ap_mdev_hot_unplug_domains(struct ap_matrix_mdev *matrix_mdev,
+ unsigned long *apqis)
{
struct vfio_ap_queue *q, *tmpq;
struct list_head qlist;
+ unsigned long apqi;
+ bool apcb_update = false;
INIT_LIST_HEAD(&qlist);
- vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist);
- if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) {
- clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm);
- vfio_ap_mdev_update_guest_apcb(matrix_mdev);
+ for_each_set_bit_inv(apqi, apqis, AP_DOMAINS) {
+ vfio_ap_mdev_unlink_domain(matrix_mdev, apqi, &qlist);
+
+ if (test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm)) {
+ clear_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm);
+ apcb_update = true;
+ }
}
+ /* Only update apcb if needed to avoid impacting guest */
+ if (apcb_update)
+ vfio_ap_mdev_update_guest_apcb(matrix_mdev);
+
vfio_ap_mdev_reset_qlist(&qlist);
list_for_each_entry_safe(q, tmpq, &qlist, reset_qnode) {
@@ -1322,6 +1351,16 @@ static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev,
}
}
+static void vfio_ap_mdev_hot_unplug_domain(struct ap_matrix_mdev *matrix_mdev,
+ unsigned long apqi)
+{
+ DECLARE_BITMAP(apqis, AP_DOMAINS);
+
+ bitmap_zero(apqis, AP_DEVICES);
+ set_bit_inv(apqi, apqis);
+ vfio_ap_mdev_hot_unplug_domains(matrix_mdev, apqis);
+}
+
/**
* unassign_domain_store - parses the APQI from @buf and clears the
* corresponding bit in the mediated matrix device's AQM
@@ -1570,6 +1609,158 @@ static ssize_t guest_matrix_show(struct device *dev,
}
static DEVICE_ATTR_RO(guest_matrix);
+static ssize_t write_ap_bitmap(unsigned long *bitmap, char *buf, int offset, char sep)
+{
+ return sysfs_emit_at(buf, offset, "0x%016lx%016lx%016lx%016lx%c",
+ bitmap[0], bitmap[1], bitmap[2], bitmap[3], sep);
+}
+
+static ssize_t ap_config_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
+ int idx = 0;
+
+ idx += write_ap_bitmap(matrix_mdev->matrix.apm, buf, idx, ',');
+ idx += write_ap_bitmap(matrix_mdev->matrix.aqm, buf, idx, ',');
+ idx += write_ap_bitmap(matrix_mdev->matrix.adm, buf, idx, '\n');
+
+ return idx;
+}
+
+/* Number of characters needed for a complete hex mask representing the bits in .. */
+#define AP_DEVICES_STRLEN (AP_DEVICES / 4 + 3)
+#define AP_DOMAINS_STRLEN (AP_DOMAINS / 4 + 3)
+#define AP_CONFIG_STRLEN (AP_DEVICES_STRLEN + 2 * AP_DOMAINS_STRLEN)
+
+static int parse_bitmap(char **strbufptr, unsigned long *bitmap, int nbits)
+{
+ char *curmask;
+
+ curmask = strsep(strbufptr, ",\n");
+ if (!curmask)
+ return -EINVAL;
+
+ bitmap_clear(bitmap, 0, nbits);
+ return ap_hex2bitmap(curmask, bitmap, nbits);
+}
+
+static int ap_matrix_overflow_check(struct ap_matrix_mdev *matrix_mdev)
+{
+ unsigned long bit;
+
+ for_each_set_bit_inv(bit, matrix_mdev->matrix.apm, AP_DEVICES) {
+ if (bit > matrix_mdev->matrix.apm_max)
+ return -ENODEV;
+ }
+
+ for_each_set_bit_inv(bit, matrix_mdev->matrix.aqm, AP_DOMAINS) {
+ if (bit > matrix_mdev->matrix.aqm_max)
+ return -ENODEV;
+ }
+
+ for_each_set_bit_inv(bit, matrix_mdev->matrix.adm, AP_DOMAINS) {
+ if (bit > matrix_mdev->matrix.adm_max)
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void ap_matrix_copy(struct ap_matrix *dst, struct ap_matrix *src)
+{
+ /* This check works around false positive gcc -Wstringop-overread */
+ if (!src)
+ return;
+
+ bitmap_copy(dst->apm, src->apm, AP_DEVICES);
+ bitmap_copy(dst->aqm, src->aqm, AP_DOMAINS);
+ bitmap_copy(dst->adm, src->adm, AP_DOMAINS);
+}
+
+static ssize_t ap_config_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
+ struct ap_matrix m_new, m_old, m_added, m_removed;
+ DECLARE_BITMAP(apm_filtered, AP_DEVICES);
+ unsigned long newbit;
+ char *newbuf, *rest;
+ int rc = count;
+ bool do_update;
+
+ newbuf = kstrndup(buf, AP_CONFIG_STRLEN, GFP_KERNEL);
+ if (!newbuf)
+ return -ENOMEM;
+ rest = newbuf;
+
+ mutex_lock(&ap_perms_mutex);
+ get_update_locks_for_mdev(matrix_mdev);
+
+ /* Save old state */
+ ap_matrix_copy(&m_old, &matrix_mdev->matrix);
+ if (parse_bitmap(&rest, m_new.apm, AP_DEVICES) ||
+ parse_bitmap(&rest, m_new.aqm, AP_DOMAINS) ||
+ parse_bitmap(&rest, m_new.adm, AP_DOMAINS)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ bitmap_andnot(m_removed.apm, m_old.apm, m_new.apm, AP_DEVICES);
+ bitmap_andnot(m_removed.aqm, m_old.aqm, m_new.aqm, AP_DOMAINS);
+ bitmap_andnot(m_added.apm, m_new.apm, m_old.apm, AP_DEVICES);
+ bitmap_andnot(m_added.aqm, m_new.aqm, m_old.aqm, AP_DOMAINS);
+
+ /* Need new bitmaps in matrix_mdev for validation */
+ ap_matrix_copy(&matrix_mdev->matrix, &m_new);
+
+ /* Ensure new state is valid, else undo new state */
+ rc = vfio_ap_mdev_validate_masks(matrix_mdev);
+ if (rc) {
+ ap_matrix_copy(&matrix_mdev->matrix, &m_old);
+ goto out;
+ }
+ rc = ap_matrix_overflow_check(matrix_mdev);
+ if (rc) {
+ ap_matrix_copy(&matrix_mdev->matrix, &m_old);
+ goto out;
+ }
+ rc = count;
+
+ /* Need old bitmaps in matrix_mdev for unplug/unlink */
+ ap_matrix_copy(&matrix_mdev->matrix, &m_old);
+
+ /* Unlink removed adapters/domains */
+ vfio_ap_mdev_hot_unplug_adapters(matrix_mdev, m_removed.apm);
+ vfio_ap_mdev_hot_unplug_domains(matrix_mdev, m_removed.aqm);
+
+ /* Need new bitmaps in matrix_mdev for linking new adapters/domains */
+ ap_matrix_copy(&matrix_mdev->matrix, &m_new);
+
+ /* Link newly added adapters */
+ for_each_set_bit_inv(newbit, m_added.apm, AP_DEVICES)
+ vfio_ap_mdev_link_adapter(matrix_mdev, newbit);
+
+ for_each_set_bit_inv(newbit, m_added.aqm, AP_DOMAINS)
+ vfio_ap_mdev_link_domain(matrix_mdev, newbit);
+
+ /* filter resources not bound to vfio-ap */
+ do_update = vfio_ap_mdev_filter_matrix(matrix_mdev, apm_filtered);
+ do_update |= vfio_ap_mdev_filter_cdoms(matrix_mdev);
+
+ /* Apply changes to shadow apbc if things changed */
+ if (do_update) {
+ vfio_ap_mdev_update_guest_apcb(matrix_mdev);
+ reset_queues_for_apids(matrix_mdev, apm_filtered);
+ }
+out:
+ release_update_locks_for_mdev(matrix_mdev);
+ mutex_unlock(&ap_perms_mutex);
+ kfree(newbuf);
+ return rc;
+}
+static DEVICE_ATTR_RW(ap_config);
+
static struct attribute *vfio_ap_mdev_attrs[] = {
&dev_attr_assign_adapter.attr,
&dev_attr_unassign_adapter.attr,
@@ -1577,6 +1768,7 @@ static struct attribute *vfio_ap_mdev_attrs[] = {
&dev_attr_unassign_domain.attr,
&dev_attr_assign_control_domain.attr,
&dev_attr_unassign_control_domain.attr,
+ &dev_attr_ap_config.attr,
&dev_attr_control_domains.attr,
&dev_attr_matrix.attr,
&dev_attr_guest_matrix.attr,
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index 98d37aa27044a6..437a161c865983 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -75,11 +75,11 @@ extern struct ap_matrix_dev *matrix_dev;
*/
struct ap_matrix {
unsigned long apm_max;
- DECLARE_BITMAP(apm, 256);
+ DECLARE_BITMAP(apm, AP_DEVICES);
unsigned long aqm_max;
- DECLARE_BITMAP(aqm, 256);
+ DECLARE_BITMAP(aqm, AP_DOMAINS);
unsigned long adm_max;
- DECLARE_BITMAP(adm, 256);
+ DECLARE_BITMAP(adm, AP_DOMAINS);
};
/**
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d9e82ae68244e6..defa5109cc62ac 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -313,6 +313,38 @@ out_release:
goto out;
}
+static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct folio *folio;
+ int ret = -ENOMEM;
+
+ folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
+ if (!folio)
+ return ret;
+
+ if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
+ goto out_put;
+
+ /*
+ * The memory barrier inside __folio_mark_uptodate makes sure that
+ * zeroing out the folio become visible before mapping the page
+ * using set_pte_at(). See do_anonymous_page().
+ */
+ __folio_mark_uptodate(folio);
+
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, 0);
+ if (ret)
+ goto out_put;
+
+ return 0;
+out_put:
+ folio_put(folio);
+ return ret;
+}
+
static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
@@ -321,6 +353,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
spinlock_t *ptl;
int ret;
+ if (mm_forbids_zeropage(dst_vma->vm_mm))
+ return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
ret = -EAGAIN;
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 2f5b91da5afa9e..937294ff164fc8 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -601,11 +601,6 @@ static int ignore_undef_symbol(struct elf_info *info, const char *symname)
strstarts(symname, "_savevr_") ||
strcmp(symname, ".TOC.") == 0)
return 1;
-
- if (info->hdr->e_machine == EM_S390)
- /* Expoline thunks are linked on all kernel modules during final link of .ko */
- if (strstarts(symname, "__s390_indirect_jump_r"))
- return 1;
/* Do not ignore this symbol */
return 0;
}
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 741c7dc16afc75..ed4ad591f19353 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -180,6 +180,7 @@ TEST_GEN_PROGS_s390x += s390x/sync_regs_test
TEST_GEN_PROGS_s390x += s390x/tprot
TEST_GEN_PROGS_s390x += s390x/cmma_test
TEST_GEN_PROGS_s390x += s390x/debug_test
+TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test
TEST_GEN_PROGS_s390x += demand_paging_test
TEST_GEN_PROGS_s390x += dirty_log_test
TEST_GEN_PROGS_s390x += guest_print_test
diff --git a/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c
new file mode 100644
index 00000000000000..dc1f5f18f33088
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/shared_zeropage_test.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test shared zeropage handling (with/without storage keys)
+ *
+ * Copyright (C) 2024, Red Hat, Inc.
+ */
+#include <sys/mman.h>
+
+#include <linux/fs.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kselftest.h"
+
+static void set_storage_key(void *addr, uint8_t skey)
+{
+ asm volatile("sske %0,%1" : : "d" (skey), "a" (addr));
+}
+
+static void guest_code(void)
+{
+ /* Issue some storage key instruction. */
+ set_storage_key((void *)0, 0x98);
+ GUEST_DONE();
+}
+
+/*
+ * Returns 1 if the shared zeropage is mapped, 0 if something else is mapped.
+ * Returns < 0 on error or if nothing is mapped.
+ */
+static int maps_shared_zeropage(int pagemap_fd, void *addr)
+{
+ struct page_region region;
+ struct pm_scan_arg arg = {
+ .start = (uintptr_t)addr,
+ .end = (uintptr_t)addr + 4096,
+ .vec = (uintptr_t)&region,
+ .vec_len = 1,
+ .size = sizeof(struct pm_scan_arg),
+ .category_mask = PAGE_IS_PFNZERO,
+ .category_anyof_mask = PAGE_IS_PRESENT,
+ .return_mask = PAGE_IS_PFNZERO,
+ };
+ return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
+}
+
+int main(int argc, char *argv[])
+{
+ char *mem, *page0, *page1, *page2, tmp;
+ const size_t pagesize = getpagesize();
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ int pagemap_fd;
+
+ ksft_print_header();
+ ksft_set_plan(3);
+
+ /*
+ * We'll use memory that is not mapped into the VM for simplicity.
+ * Shared zeropages are enabled/disabled per-process.
+ */
+ mem = mmap(0, 3 * pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
+ TEST_ASSERT(mem != MAP_FAILED, "mmap() failed");
+
+ /* Disable THP. Ignore errors on older kernels. */
+ madvise(mem, 3 * pagesize, MADV_NOHUGEPAGE);
+
+ page0 = mem;
+ page1 = page0 + pagesize;
+ page2 = page1 + pagesize;
+
+ /* Can we even detect shared zeropages? */
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ TEST_REQUIRE(pagemap_fd >= 0);
+
+ tmp = *page0;
+ asm volatile("" : "+r" (tmp));
+ TEST_REQUIRE(maps_shared_zeropage(pagemap_fd, page0) == 1);
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ /* Verify that we get the shared zeropage after VM creation. */
+ tmp = *page1;
+ asm volatile("" : "+r" (tmp));
+ ksft_test_result(maps_shared_zeropage(pagemap_fd, page1) == 1,
+ "Shared zeropages should be enabled\n");
+
+ /*
+ * Let our VM execute a storage key instruction that should
+ * unshare all shared zeropages.
+ */
+ vcpu_run(vcpu);
+ get_ucall(vcpu, &uc);
+ TEST_ASSERT_EQ(uc.cmd, UCALL_DONE);
+
+ /* Verify that we don't have a shared zeropage anymore. */
+ ksft_test_result(!maps_shared_zeropage(pagemap_fd, page1),
+ "Shared zeropage should be gone\n");
+
+ /* Verify that we don't get any new shared zeropages. */
+ tmp = *page2;
+ asm volatile("" : "+r" (tmp));
+ ksft_test_result(!maps_shared_zeropage(pagemap_fd, page2),
+ "Shared zeropages should be disabled\n");
+
+ kvm_vm_free(vm);
+
+ ksft_finished();
+}